Task3 Text Classification Based on Machine Learning

ryluo 2020-07-25 20:16:37

Load packages and read data

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb

data_path = './data/NLP_data_list_0715.csv'
trn_csv = './data/train_set.csv'
tst_csv = './data/test_a.csv'
sub_csv = './data/test_a_sample_submit.csv'

trn_data = pd.read_csv(trn_csv, sep='\t')
tst_data = pd.read_csv(tst_csv)
sub_data = pd.read_csv(sub_csv)

TF-IDF features

# vectorizer = CountVectorizer(max_features=3000)
# fit the vectorizer on train + test text together so both share the same vocabulary
trn_tst_text = pd.concat([trn_data['text'], tst_data['text']])
tfidf = TfidfVectorizer(min_df=30, ngram_range=(1, 3), max_features=3000)
tfidf_feats = tfidf.fit_transform(trn_tst_text)

# the first 200,000 rows are the training texts, the remainder are the test texts
trn_feats = tfidf_feats[:200000]
tst_feats = tfidf_feats[200000:]
trn_label = trn_data['label']
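
A quick sanity check that the slicing above lines up with the raw data (a sketch, using only names already defined) is:

# sanity check: fitted vocabulary size and train/test split sizes
print(len(tfidf.vocabulary_))                    # at most 3000, per max_features
print(trn_feats.shape, tst_feats.shape)
assert trn_feats.shape[0] == trn_data.shape[0]   # the 200,000 training texts
assert tst_feats.shape[0] == tst_data.shape[0]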

RidgeClassifier

# RidgeClassifier
kfold = KFold(n_splits=5, shuffle=True, random_state=2020)
model1 = RidgeClassifier()  # this model has no predict_proba
test_result1 = np.zeros((tst_data.shape[0],))
oof = np.zeros(trn_feats.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(kfold.split(trn_feats, trn_label)):
    print('Fold:', fold_)
    trn_x, trn_y = trn_feats[trn_idx], trn_label[trn_idx]
    val_x, val_y = trn_feats[val_idx], trn_label[val_idx]

    model1.fit(trn_x, trn_y)

    oof[val_idx] = model1.predict(val_x)
    test_result1 += model1.predict(tst_feats) / 5  # NOTE: averages hard class labels across folds; see the score-averaging sketch below

print('5 folds:',f1_score(trn_label, oof, average='macro'))
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
5 folds: 0.8957887045122883
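
RidgeClassifier exposes no predict_proba, so the loop above can only average hard class labels across folds, which is not a meaningful ensemble. A possible alternative (a sketch, not part of the original code) is to accumulate decision_function scores per fold and take the argmax at the end:

# sketch: average per-fold decision scores instead of hard labels
# assumes kfold, trn_feats, trn_label and tst_feats defined above, and that
# every fold sees all classes (reasonable here with 200,000 training documents)
test_scores = np.zeros((tst_feats.shape[0], trn_label.nunique()))
for fold_, (trn_idx, val_idx) in enumerate(kfold.split(trn_feats, trn_label)):
    clf = RidgeClassifier()
    clf.fit(trn_feats[trn_idx], trn_label[trn_idx])
    test_scores += clf.decision_function(tst_feats) / 5
test_result1 = clf.classes_[test_scores.argmax(axis=1)]  # test-set labels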

LogisticRegression

# LogisticRegression
kfold = KFold(n_splits=5, shuffle=True, random_state=2020)
model2 = LogisticRegression(C=4, n_jobs=48)
test_result2 = np.zeros((tst_data.shape[0],))
oof = np.zeros(trn_feats.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(kfold.split(trn_feats, trn_label)):
    print('Fold:', fold_)
    trn_x, trn_y = trn_feats[trn_idx], trn_label[trn_idx]
    val_x, val_y = trn_feats[val_idx], trn_label[val_idx]

    model2.fit(trn_x, trn_y)

    oof[val_idx] = model2.predict(val_x)
    test_result2 += model2.predict(tst_feats) / 5 

print('5 folds:',f1_score(trn_label, oof, average='macro'))
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
5 folds: 0.9189422039775225

MultinomialNB

# MultinomialNB
# the 5-fold CV loop is skipped for this model; a single hold-out split is used below
model3 = MultinomialNB()


x_trn, x_val, y_trn, y_val = train_test_split(trn_feats, trn_label, test_size=0.2)

model3.fit(x_trn, y_trn)
oof = model3.predict(x_val)

test_result3 = model3.predict(tst_feats)

print(f1_score(y_val, oof, average='macro'))
0.8323223344693526

BernoulliNB

# BernoulliNB
# the 5-fold CV loop is skipped for this model; a single hold-out split is used below
model4 = BernoulliNB()

x_trn, x_val, y_trn, y_val = train_test_split(trn_feats, trn_label, test_size=0.2)

model4.fit(x_trn, y_trn)
oof = model4.predict(x_val)

test_result4 = model4.predict(tst_feats)

print(f1_score(y_val, oof, average='macro'))
0.6321408362720705

SGDClassifier

# SGDClassifier
# the 5-fold CV loop is skipped for this model; a single hold-out split is used below
model5 = SGDClassifier()

x_trn, x_val, y_trn, y_val = train_test_split(trn_feats, trn_label, test_size=0.2)

model5.fit(x_trn, y_trn)
oof = model5.predict(x_val)

test_result5 = model5.predict(tst_feats)

print(f1_score(y_val, oof, average='macro'))
0.894896164771141

PassiveAggressiveClassifier

# PassiveAggressiveClassifier
# the 5-fold CV loop is skipped for this model; a single hold-out split is used below
model6 = PassiveAggressiveClassifier()

x_trn, x_val, y_trn, y_val = train_test_split(trn_feats, trn_label, test_size=0.2)

model6.fit(x_trn, y_trn)
oof = model6.predict(x_val)

test_result6 = model6.predict(tst_feats)

print(f1_score(y_val, oof, average='macro'))
0.9168828495905037

RF

model7 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                max_depth=None, max_features='auto', max_leaf_nodes=None,
                                min_samples_leaf=25, min_samples_split=2,
                                min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=48,
                                oob_score=False, random_state=None, verbose=0,
                                warm_start=False)

x_trn, x_val, y_trn, y_val = train_test_split(trn_feats, trn_label, test_size=0.2)

model7.fit(x_trn, y_trn)
oof = model7.predict(x_val)

test_result7 = model7.predict(tst_feats)

print(f1_score(y_val, oof, average='macro'))
0.8236463631896516

XGB

params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # multi-class classification
    'num_class': 10,               # number of classes, used together with multi:softmax
    'gamma': 0.1,                  # controls post-pruning; larger is more conservative (typically 0.1 or 0.2)
    'max_depth': 12,               # tree depth; larger values overfit more easily
    'lambda': 2,                   # L2 regularization on weights; larger values make overfitting less likely
    'subsample': 0.7,              # random subsampling of training instances
    'colsample_bytree': 0.7,       # column subsampling when building each tree
    'min_child_weight': 3,
    'silent': 1,                   # 1 suppresses training logs; 0 is usually preferable
    'eta': 0.007,                  # works like a learning rate
    'seed': 1000,
    'nthread': 4,                  # number of CPU threads
}

x_train_, x_valid_, y_train_, y_valid_ = train_test_split(trn_feats, trn_label, test_size=0.2)
X_test = tst_feats

model = XGBClassifier(**params)
model.fit(x_train_,
          y_train_, 
          eval_set=[(x_valid_, y_valid_)],
          eval_metric=['mlogloss'], 
          early_stopping_rounds=10,  # stop early if the score has not improved for N consecutive rounds
          verbose=10)

# model evaluation on the validation split
val_result = model.predict(x_valid_)

print(f1_score(y_valid_, val_result, average='macro'))
0.94616875 0.9190890434012762
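
The XGB cell above only evaluates the validation split; assuming the fitted model and tst_feats from earlier, a minimal sketch for scoring the test set (test_result8 is a name introduced here, not from the original code) is:

# sketch: predict test-set labels with the early-stopped model
test_result8 = model.predict(tst_feats)
print(pd.Series(test_result8).value_counts())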

Conclusion (macro F1 scores):

  1. RidgeClassifier(0.8957887045122883)
  2. LogisticRegression(0.9189422039775225)
  3. MultinomialNB(0.8323223344693526)
  4. BernoulliNB(0.6321408362720705)
  5. SGDClassifier(0.894896164771141)
  6. PassiveAggressiveClassifier(0.9168828495905037)
  7. RF(0.8236463631896516)
  8. XGB(0.9190890434012762)
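
sub_data is read at the top but never written back. Assuming the sample submission file has a single 'label' column (an assumption about test_a_sample_submit.csv), a sketch for exporting, say, the LogisticRegression predictions is:

# sketch: write a submission file; rounding is needed because test_result2
# holds fold-averaged hard labels (see the caveat in the RidgeClassifier section)
sub_data['label'] = test_result2.round().astype(int)
sub_data.to_csv('./submit.csv', index=False)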