Task 5: Text Classification Based on Deep Learning (Part 2)

ryluo 2020-07-31 21:50:40

I don't currently have a GPU machine to run the code on, so I have only written the code and will run it once a machine is available. For now I cannot guarantee that every code block runs as-is!

Import packages

import os, logging, re, time, gc
import numpy as np
import pandas as pd 
import fasttext
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import *
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from gensim.models import FastText, Word2Vec
import tensorflow as tf
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical
from keras.initializers import *
import random as rn
import gensim

Read the data

root_path = '../data/'
train_df = pd.read_csv(os.path.join(root_path, 'train_set.csv'), sep='\t')
test_df = pd.read_csv(os.path.join(root_path, 'test_a.csv'), sep='\t')
all_data = pd.concat([train_df['text'], test_df['text']], axis=0)

Train word embeddings

# Build the embeddings. Word2vec is used here; other embeddings (e.g. GloVe) could be substituted.
def train_save_word2vec(docs, embed_size=300, save_name='w2v.txt', split_char=' '):
    '''
    Input
    docs: list of input texts
    embed_size: embedding dimension
    save_name: path to save the word2vec vectors

    Output
    w2v: the trained model
    '''
    input_docs = []
    for i in docs:
        input_docs.append(i.split(split_char))
    logging.basicConfig(
        format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
    # gensim 3.x API (size/iter); gensim 4.x renames these to vector_size/epochs
    w2v = Word2Vec(input_docs, size=embed_size, sg=1, window=12, seed=2020, workers=32, min_count=1, iter=10)
    w2v.wv.save_word2vec_format(save_name)
    print("w2v model done")
    # sanity-check the word vectors
    print('************* word vector check *************')
    print(w2v.wv[input_docs[0][0]])
    return w2v

# This step is slow
word_lst = list(all_data)
print(len(word_lst))
train_save_word2vec(word_lst, save_name=os.path.join(root_path, 'w2v_300.txt'), split_char=' ')
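A minimal sketch of the same pipeline with subword-aware FastText vectors (gensim's FastText is already imported above); the function name and output filename are assumptions, and the vectors are saved in the same word2vec text format so they can be loaded by the embedding-matrix code below.

# Sketch: train FastText embeddings with the gensim 3.x API as a drop-in
# alternative to the word2vec vectors above (function/file names are assumptions).
def train_save_fasttext(docs, embed_size=300, save_name='ft_300.txt', split_char=' '):
    input_docs = [d.split(split_char) for d in docs]
    ft = FastText(input_docs, size=embed_size, sg=1, window=12,
                  seed=2020, workers=32, min_count=1, iter=10)
    ft.wv.save_word2vec_format(save_name)   # same text format as w2v_300.txt
    return ft

# ft = train_save_fasttext(word_lst, save_name=os.path.join(root_path, 'ft_300.txt'))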

Text preprocessing

### Tokenizer: turn the texts into padded integer sequences
def set_tokenizer(docs, split_char=' ', max_len=100):
    '''
    Input
    docs: list of texts
    split_char: character to split on
    max_len: maximum sequence length to keep

    Output
    X: the tokenized, padded sequences
    word_index: mapping from tokens to integer indices
    '''
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    maxlen = max_len
    X = pad_sequences(X, maxlen=maxlen, value=0)
    word_index=tokenizer.word_index
    return X, word_index

# Build the embedding matrix from the pretrained vectors
def get_embedding_matrix(word_index, embed_size=128, Emed_path="w2v_300.txt"):
    embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
        Emed_path, binary=False)
    nb_words = len(word_index)+1
    embedding_matrix = np.zeros((nb_words, embed_size))
    count = 0  # number of tokens without a pretrained vector
    for word, i in tqdm(word_index.items()):  # word is the token, i is its integer index
        if i >= nb_words:
            continue
        try:
            embedding_vector = embeddings_index[word]
        except:
            embedding_vector = np.zeros(embed_size)
            count += 1
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector    
    print("null cnt",count)
    return embedding_matrix

############################
all_data_list = list(all_data)
print('Start tokenizing')
all_data_text, idx_all = set_tokenizer(all_data_list, split_char=' ', max_len=150)
# build the embedding matrix
emb = get_embedding_matrix(idx_all, embed_size=300, Emed_path=os.path.join(root_path, 'w2v_300.txt'))
label = to_categorical(train_df['label'] - 1, 13)

trn_text = all_data_text[:len(train_df)]   # the first len(train_df) rows are the training set
tst_text = all_data_text[len(train_df):]

Define the models

TextCNN

def TextCNN(emb):
    K.clear_session()
    emb_layer = Embedding(
                        input_dim=emb.shape[0],
                        output_dim=emb.shape[1],
                        weights=[emb],
                        input_length=150,
                        trainable=False)

    seq = Input(shape=(150, ))

    x = emb_layer(seq)
    sdrop = SpatialDropout1D(rate=0.2)
    x = sdrop(x)

    convs = []
    filter_sizes = [2, 3, 4, 5]
    maxlen = 150  # must match input_length of the embedding layer
    for fsz in filter_sizes:
        x_ = Conv1D(filters=100, kernel_size=fsz, activation='relu')(x)
        x_ = MaxPooling1D(maxlen - fsz + 1)(x_)
        x_ = Flatten()(x_)
        convs.append(x_)

    merge = concatenate(convs, axis=1)
    pred = Dropout(0.5)(merge)
    pred = Dense(128, activation='relu')(pred)
    pred = Dense(units=13, activation='softmax')(pred)
    model = Model(inputs=[seq], outputs=pred)

    return model

TextLSTM

def TextLSTM(emb):
    K.clear_session()
    emb_layer = Embedding(
                        input_dim=emb.shape[0],
                        output_dim=emb.shape[1],
                        weights=[emb],
                        input_length=150,
                        trainable=False)

    seq = Input(shape=(150, ))

    x = emb_layer(seq)
    sdrop = SpatialDropout1D(rate=0.2)
    x = sdrop(x)

    x = Dropout(0.2)(Bidirectional(CuDNNLSTM(200, return_sequences=True))(x))
    semantic = TimeDistributed(Dense(100, activation="tanh"))(x)
    merged_max = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)
    merged_avg = Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,))(semantic)

    x = concatenate([merged_max, merged_avg])
    x = Dropout(0.2)(Activation(activation="relu")(BatchNormalization()(Dense(1000)(x))))
    x = Activation(activation="relu")(BatchNormalization()(Dense(500)(x)))
    pred = Dense(13, activation='softmax')(x)

    model = Model(inputs=[seq], outputs=pred)
    return model

TextHAN

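A minimal sketch standing in for TextHAN: word-level attention pooling over a BiGRU. A full HAN would split each document into sentences and apply attention at both the word and sentence level; the layer sizes here are assumptions.

def TextHAN(emb):
    K.clear_session()
    emb_layer = Embedding(
                        input_dim=emb.shape[0],
                        output_dim=emb.shape[1],
                        weights=[emb],
                        input_length=150,
                        trainable=False)

    seq = Input(shape=(150, ))

    x = emb_layer(seq)
    x = SpatialDropout1D(rate=0.2)(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)

    # attention: score each timestep, softmax-normalise, take the weighted sum
    u = TimeDistributed(Dense(128, activation='tanh'))(x)
    att = TimeDistributed(Dense(1, use_bias=False))(u)
    att = Lambda(lambda s: K.softmax(K.squeeze(s, axis=-1), axis=1))(att)
    x = Lambda(lambda t: K.batch_dot(t[1], t[0], axes=(1, 1)))([x, att])

    x = Dropout(0.2)(Dense(256, activation='relu')(x))
    pred = Dense(13, activation='softmax')(x)

    model = Model(inputs=[seq], outputs=pred)
    return model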

Model training

skf = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
sub = np.zeros((tst_text.shape[0], 13))
oof_pred = np.zeros((trn_text.shape[0], 13))
score = []
count = 0
# the checkpoints below are saved under root_path/model
if not os.path.exists(os.path.join(root_path, 'model')):
    os.mkdir(os.path.join(root_path, 'model'))

# StratifiedKFold needs 1-D class labels, not the one-hot matrix
for i, (train_index, test_index) in enumerate(skf.split(trn_text, label.argmax(axis=1))):
    print("FOLD | ", count+1)
    print("###"*35)
    gc.collect()
    filepath = os.path.join(root_path, 'model/nn_v1_%d.h5' % count)
    checkpoint = ModelCheckpoint(
        filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max', save_weights_only=True)
    reduce_lr = ReduceLROnPlateau(
        monitor='val_accuracy', factor=0.5, patience=3, min_lr=0.0001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_accuracy', min_delta=0.0001, patience=5, verbose=1, mode='max')
    callbacks = [checkpoint, reduce_lr, earlystopping]
    model = TextCNN(emb)
#     model = TextLSTM(emb)
#     model = TextHAN(emb)
    # compile before fitting so that 'accuracy' / 'val_accuracy' are tracked
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    if count == 0: model.summary()
    x_tr, x_va = np.array(trn_text)[train_index], np.array(trn_text)[test_index]
    y_tr, y_va = label[train_index], label[test_index]

    hist = model.fit(x_tr, y_tr, batch_size=128, epochs=50,
                     validation_data=(x_va, y_va),
                     callbacks=callbacks, verbose=1, shuffle=True)

    model.load_weights(filepath)
    oof_pred[test_index] = model.predict(x_va, batch_size=128, verbose=1)
    sub += model.predict(tst_text, batch_size=128, verbose=1) / skf.n_splits
    score.append(np.max(hist.history['val_accuracy']))
    count += 1
    break  # only run one fold; full cross-validation runs out of memory
print('acc:', np.mean(score))
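f1_score is imported above but never used; a minimal sketch that scores the out-of-fold predictions with macro F1 (assuming macro F1 is the evaluation metric, and scoring only the folds that were actually trained):

# Sketch: macro F1 on the out-of-fold predictions.
# Only rows that received a prediction (the trained folds) are scored.
val_mask = oof_pred.sum(axis=1) > 0
y_true = label.argmax(axis=1)[val_mask]
y_pred = oof_pred[val_mask].argmax(axis=1)
print('oof macro F1:', f1_score(y_true, y_pred, average='macro'))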

Model prediction

# sub (the averaged test predictions) is reduced with argmax to a class index,
# then shifted back to the original label range
submit = pd.read_csv(os.path.join(root_path, 'submit.csv'))
submit['label'] = sub.argmax(1) + 1
submit.head()
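A minimal sketch for writing the result to disk; the output filename is an assumption.

# Sketch: save the submission file (filename is an assumption)
submit.to_csv(os.path.join(root_path, 'submission.csv'), index=False)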