The data used in this article poses a multioutput problem: each sample has 9 output attributes, and each output is further divided into multiple classes (multiclass). The task is to predict each sample's class under each of the 9 outputs (note: a sample can take exactly one class per output).
A detailed description of the data and the problem statement can be found at this link.
I. Data Splitting
1. Build a function that splits a test set off the data
The function takes the data's two-dimensional label matrix [shape: (number of samples, total number of classes across the 9 attributes)], in which every entry is 0 or 1. It guarantees that a minimum number of samples from every class of every attribute ends up in the test set.
import numpy as np
import pandas as pd

### Takes a label matrix 'y' and returns the indices for the test set
### The size of the test set is 'size' if 'size' > 1, or 'size' * len(y) if 'size' <= 1
### The test set is guaranteed to contain at least 'min_count' examples of each label
def multilabel_sample(y, size=1000, min_count=5, seed=None):
    if (np.unique(y).astype(int) != np.array([0, 1])).any():
        raise ValueError('multilabel_sample only works with binary indicator matrices')
    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some labels do not have enough examples. Change min_count if necessary.')

    if size <= 1:
        size = np.floor(y.shape[0] * size)
    if y.shape[1] * min_count > size:
        size = y.shape[1] * min_count

    if isinstance(y, pd.DataFrame):
        y = y.values

    choices = np.arange(y.shape[0])
    rng = np.random.RandomState(seed if seed is not None else np.random.randint(low=1))

    # guarantee at least min_count of each label
    sample_idxs = np.array([], dtype=choices.dtype)
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        label_idxs_sampled = rng.choice(label_choices, size=min_count, replace=False)
        sample_idxs = np.concatenate([label_idxs_sampled, sample_idxs])
    sample_idxs = np.unique(sample_idxs)

    # just random sample from the remaining data
    remaining_count = int(size - sample_idxs.shape[0])
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices, size=remaining_count, replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])
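For intuition, here is a small usage sketch on a made-up label matrix (the toy data and arguments below are hypothetical, purely for illustration):

# Hypothetical toy data: 100 samples, 4 binary label columns
toy_rng = np.random.RandomState(0)
toy_y = (toy_rng.rand(100, 4) > 0.5).astype(int)

test_idxs = multilabel_sample(toy_y, size=0.2, min_count=3, seed=0)
print(len(test_idxs))                # 20 test indices (floor of 0.2 * 100)
print(toy_y[test_idxs].sum(axis=0))  # every label occurs at least 3 times in the test set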
2. Define the evaluation metric for the test set
### The data has 9 output attributes; each output has a different number of classes
LABEL_INDICES = [range(0, 37), range(37, 48), range(48, 51), range(51, 76), range(76, 79),
                 range(79, 82), range(82, 87), range(87, 96), range(96, 104)]

### Multioutput-multiclass logarithmic loss metric
### predicted, actual: 2D numpy arrays with shape (number of samples, total number of classes across the 9 outputs)
def multi_multi_log_loss(predicted, actual, label_column_indices=LABEL_INDICES, eps=1e-15):
    output_scores = np.ones(len(label_column_indices), dtype=np.float64)  # stores the log loss of each output

    # calculate log loss for each set of classes that belong to one output
    for k, this_output_indices in enumerate(label_column_indices):
        preds_k = predicted[:, this_output_indices].astype(np.float64)  # get just the columns for this output
        preds_k /= np.clip(preds_k.sum(axis=1).reshape(-1, 1), eps, np.inf)  # normalize predicted probabilities
        y_hats = np.clip(preds_k, eps, 1 - eps)  # shrink predictions away from 0 and 1
        actual_k = actual[:, this_output_indices]
        sum_logs = np.sum(actual_k * np.log(y_hats))
        output_scores[k] = (-1.0 / actual.shape[0]) * sum_logs

    return np.average(output_scores)
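As a quick sanity check (a hypothetical toy case, not part of the original post), perfect one-hot predictions should score close to 0, while predicting a uniform distribution over an output with C classes contributes log(C) to that output's score:

# Hypothetical example: 2 outputs with 2 and 3 classes
toy_indices = [range(0, 2), range(2, 5)]
actual = np.array([[1, 0, 0, 1, 0],
                   [0, 1, 1, 0, 0]], dtype=np.float64)
uniform = np.array([[0.5, 0.5, 1/3, 1/3, 1/3],
                    [0.5, 0.5, 1/3, 1/3, 1/3]])
print(multi_multi_log_loss(actual, actual, label_column_indices=toy_indices))   # ~0
print(multi_multi_log_loss(uniform, actual, label_column_indices=toy_indices))  # (log(2) + log(3)) / 2, about 0.896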
3. Split the data into training and test sets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from collections import defaultdict

def multilabel_train_test_split(X, Y_encoder, Y_dummies, size, min_count=5, seed=None):
    assert Y_encoder.shape[0] == Y_dummies.shape[0]
    index = np.arange(Y_dummies.shape[0])
    test_set_idxs = multilabel_sample(Y_dummies, size=size, min_count=min_count, seed=seed)
    test_set_mask = np.in1d(index, test_set_idxs)
    train_set_mask = ~test_set_mask
    return (X[train_set_mask], X[test_set_mask],
            Y_encoder[train_set_mask], Y_encoder[test_set_mask],
            Y_dummies[train_set_mask], Y_dummies[test_set_mask])

### Read the data
df = pd.read_csv('TrainingData.csv', index_col=0)
OUTPUTS = ['Function', 'Use', 'Sharing', 'Reporting', 'Student_Type', 'Position_Type',
           'Object_Type', 'Pre_K', 'Operating_Status']
TRAIN_COLUMNS = [c for c in df.columns if c not in OUTPUTS]
NUMERIC_COLUMNS = ['FTE', 'Total']

### Encode the labels
# see https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
d = defaultdict(LabelEncoder)
Y_encoder = df[OUTPUTS].apply(lambda x: d[x.name].fit_transform(x), axis=0)  # encode each output, (num_sample, 9)
ohe = OneHotEncoder(sparse=False)
Y_dummies = ohe.fit_transform(Y_encoder)  # (num_sample, 104)

### Split the data
# y_train, y_test: (num_train, 9), (num_test, 9)
# y_train_dummies, y_test_dummies: (num_train, 104), (num_test, 104)
X_train, X_test, y_train, y_test, y_train_dummies, y_test_dummies = \
    multilabel_train_test_split(df[TRAIN_COLUMNS], Y_encoder, Y_dummies, size=0.2, seed=123)
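A quick check (not in the original post) that the split behaves as intended, i.e. that every one of the 104 classes appears at least min_count = 5 times in the test set:

# Every class of every output should occur at least 5 times in the test set
print((y_test_dummies.sum(axis=0) >= 5).all())  # expected: True
print(X_train.shape[0], X_test.shape[0])        # roughly an 80/20 split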
II. Feature Engineering
1. Combine all of a sample's text features into a single string, extract the unigrams and bigrams from all the strings, and count how often each unigram and bigram occurs in each string
from sklearn.preprocessing import FunctionTransformer

def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + OUTPUTS):
    to_drop = set(to_drop) & set(data_frame.columns.tolist())  # drop the non-text columns that are in the data
    text_data = data_frame.drop(to_drop, axis=1)
    text_data.fillna('', inplace=True)
    # Join all text items in a row with a space in between
    return text_data.apply(lambda x: ' '.join(x), axis=1)

get_text_data = FunctionTransformer(combine_text_columns, validate=False)

### Alternative option: HashingVectorizer(ngram_range=(1, 2), n_features=...)
### HashingVectorizer maps each unigram and bigram in a string to a hash value (several n-grams may map to the same value) and counts the occurrences of each hash value
from sklearn.feature_extraction.text import CountVectorizer
text_vectorizer = CountVectorizer(ngram_range=(1, 2))
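To illustrate what combine_text_columns produces, here is a made-up two-row DataFrame (the column names and values are hypothetical, not the real dataset's):

# Hypothetical illustration of combine_text_columns
toy_df = pd.DataFrame({'Text_A': ['Teacher Salary', 'Supplies'],
                       'Text_B': ['Elementary', np.nan],
                       'FTE':    [1.0, 0.5],
                       'Total':  [50000.0, 120.0]})
print(combine_text_columns(toy_df).tolist())
# ['Teacher Salary Elementary', 'Supplies ']  (numeric columns dropped, NaN replaced by '')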
2. Use the chi-squared test to select among the newly generated text features. The test proceeds as follows (a small verification sketch against sklearn's implementation is given after the code below):
- Compute the observation matrix O, where O_{ij} is the sum of feature j over all samples carrying label i
- Compute the expectation matrix E, where E_{ij} is the total of feature j that label i would receive if it were allocated in proportion to label i's share of the data
- Compute a chi-squared statistic for each feature; under the null hypothesis that the feature is independent of the labels, the statistic follows a chi-squared distribution with (number of labels - 1) degrees of freedom
- The larger a feature's chi-squared statistic, the more important that feature is for training and prediction
### observation: O = np.dot(y.T, X)
### expectation: E = np.outer(y.mean(axis=0), X.sum(axis=0))
### X has shape (num_sample, num_feature) with elements >= 0; y has shape (num_sample, num_label) with elements 0 or 1
### chi-squared statistic = ((O - E) ** 2 / E).sum(axis=0), with shape (num_feature,)
from sklearn.feature_selection import chi2, SelectKBest

chi_k = 300  # keep the 300 best text features
text_feature_selector = SelectKBest(chi2, k=chi_k)
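The formulas in the comments above can be checked against sklearn's chi2 on made-up data (a hypothetical sketch; it assumes chi2 accepts a one-hot label matrix directly, which is also how it is used inside the pipeline below):

# Hypothetical verification of the chi-squared formulas
X_toy = np.array([[1, 0, 3],
                  [2, 1, 0],
                  [0, 4, 1],
                  [1, 2, 2]], dtype=np.float64)      # (num_sample=4, num_feature=3), non-negative
y_toy = np.array([[1, 0],
                  [1, 0],
                  [0, 1],
                  [0, 1]])                           # (num_sample=4, num_label=2), one-hot

O = np.dot(y_toy.T, X_toy)                           # observed feature totals per label
E = np.outer(y_toy.mean(axis=0), X_toy.sum(axis=0))  # expected totals under independence
chi2_manual = ((O - E) ** 2 / E).sum(axis=0)         # (num_feature,)

chi2_sklearn, p_values = chi2(X_toy, y_toy)
print(np.allclose(chi2_manual, chi2_sklearn))        # expected: True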
3. Combine the numeric and text features, generate interaction features from them, and build the data pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import PolynomialFeatures, MaxAbsScaler

### Feature Union
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)
num_text_feature = FeatureUnion([('numeric_features', Pipeline([('selector', get_numeric_data),
                                                                ('imputer', SimpleImputer())])),
                                 ('text_features', Pipeline([('selector', get_text_data),
                                                             ('vectorizer', text_vectorizer),
                                                             ('dim_red', text_feature_selector)]))])

### Feature Interaction
inter = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

### Data Pipeline
data_pl = Pipeline([('union', num_text_feature), ('inter', inter), ('scale', MaxAbsScaler())])
X_train_proc = data_pl.fit_transform(X_train, y_train_dummies)
X_test_proc = data_pl.transform(X_test)
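The processed matrices can get wide: the union yields 2 numeric + 300 selected text columns, and the pairwise interaction step should expand them to roughly 302 + 302 * 301 / 2 = 45753 columns (a back-of-the-envelope figure based on how PolynomialFeatures with interaction_only=True behaves; the exact shape is worth confirming):

# Inspect the processed feature dimensionality
print(X_train_proc.shape)  # expected roughly (num_train, 45753): 302 base columns plus all pairwise products
print(X_test_proc.shape)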
III. Building the Classification Models
The problem has 9 outputs, each of which is a multiclass problem, for a total of 104 classes. Three kinds of models can be used to solve it:
1. Treat it as 104 binary classification problems and train one classifier per class (a sample's label is 1 if it belongs to that class, 0 otherwise). Logistic regression is used below as the example.
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

estimator_1 = OneVsRestClassifier(LogisticRegression())
estimator_1.fit(X_train_proc, y_train_dummies)
predictions = estimator_1.predict_proba(X_test_proc)  # (num_test, 104)

# Test Logloss: 0.6632372365168943
print("Test Logloss: {}".format(multi_multi_log_loss(predictions, y_test_dummies)))
2. Treat it as 9 multiclass problems and train one classifier per output; a random forest is used here as the example. The random forest's hyperparameters below are set arbitrarily; to get better results they would need further tuning (a possible tuning sketch follows the code below).
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

estimator_2 = MultiOutputClassifier(RandomForestClassifier(random_state=42, n_estimators=50,
                                                           max_depth=25, max_features="log2"))
estimator_2.fit(X_train_proc, y_train)
predictions_list = estimator_2.predict_proba(X_test_proc)  # a list of 9 arrays, one per output
predictions = np.hstack(predictions_list)  # (num_test, 104)
print("Test Logloss: {}".format(multi_multi_log_loss(predictions, y_test_dummies)))
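One possible way to tune these hyperparameters is a randomized search (a sketch, not from the original post; the parameter ranges are made up, and it scores candidates with MultiOutputClassifier's default subset-accuracy score rather than the competition log loss):

from sklearn.model_selection import RandomizedSearchCV

# Hypothetical search space; adjust the ranges and n_iter to the available compute budget
param_dist = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [15, 25, None],
    'estimator__max_features': ['sqrt', 'log2'],
}
search = RandomizedSearchCV(MultiOutputClassifier(RandomForestClassifier(random_state=42)),
                            param_distributions=param_dist, n_iter=5, cv=3, random_state=42)
search.fit(X_train_proc, y_train)
print(search.best_params_)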
3. Use an algorithm that can handle a multioutput-multiclass problem directly; a random forest is again used as the example. Note that the algorithmic details differ when a random forest solves a multioutput-multiclass problem directly rather than a single multiclass one. The main difference is the splitting criterion used while growing each decision tree: in the multioutput-multiclass case the Gini impurity is computed for every output before and after a candidate split, the impurities are averaged over all outputs, and the split that reduces this average the most is chosen. See this link for details.
estimator_3 = RandomForestClassifier(random_state=42, n_estimators=50, max_depth=25, max_features="log2")
estimator_3.fit(X_train_proc, y_train)
predictions_list = estimator_3.predict_proba(X_test_proc)  # a list of 9 arrays, one per output
predictions = np.hstack(predictions_list)  # (num_test, 104)
print("Test Logloss: {}".format(multi_multi_log_loss(predictions, y_test_dummies)))