Task4 基于深度学习的文本分类1

ryluo 2020-07-27 20:03:16

导包

import pandas as pd 
import fasttext
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import os

# Directory (relative to the working directory) that the train/test CSV files
# are read from and that the fastText train/valid files are written to.
root_path = '../TianChi/NLP/data/'

读取数据并加上前缀(fasttext模型需要的格式)

# Load the tab-separated training and test sets.
train_df = pd.read_csv(os.path.join(root_path, 'train_set.csv'), sep='\t')
test_df = pd.read_csv(os.path.join(root_path, 'test_a.csv'), sep='\t')

# fastText's supervised mode recognises labels by their '__label__' prefix.
# Build that column here; the split that writes train.csv / valid.csv reads
# train_df['label_ft'] and would otherwise fail with a KeyError.
train_df['label_ft'] = '__label__' + train_df['label'].astype(str)

# Write the full labelled set in fastText format so that the training step,
# which reads train.csv, has its input available when the script runs top to
# bottom. The autotune section later overwrites this file with its own split.
train_df[['text', 'label_ft']].to_csv(
    os.path.join(root_path, 'train.csv'), index=None, header=None, sep='\t')

模型训练

# Train a supervised fastText classifier with hand-picked hyper-parameters.
# Expects train.csv to already exist in fastText format (one example per line,
# label carrying the '__label__' prefix).
model = fasttext.train_supervised(
    os.path.join(root_path, 'train.csv'),
    lr=1.0,          # learning rate
    epoch=25,        # number of passes over the training file
    wordNgrams=2,    # use word n-grams up to length 2
    minCount=1,      # keep every word that occurs at least once
    loss="hs",       # hierarchical softmax loss
    verbose=2,
)

模型预测

# For every test document keep only the top predicted label and drop the
# fastText prefix, i.e. take whatever follows the last '__' in the label.
test_pred = [
    model.predict(doc)[0][0].rpartition('__')[2]
    for doc in test_df['text']
]

生成提交文件

# Load the sample submission and fill its 'label' column with the predictions.
sample_submit_path = os.path.join(root_path, 'test_a_sample_submit.csv')
sub = pd.read_csv(sample_submit_path).assign(label=test_pred)
# Notebook-style preview of the first rows (no effect when run as a script).
sub.head()

线上结果0.9145

fasttext自动调参的模型训练

# Build the training and validation files used for autotuning: the first
# 160000 rows go to train.csv, the remaining rows to valid.csv. Both files are
# written tab-separated, without header or index.
ft_frame = train_df[['text', 'label_ft']]
ft_frame.iloc[:160000].to_csv(
    os.path.join(root_path, 'train.csv'), sep='\t', header=None, index=None)
ft_frame.iloc[160000:].to_csv(
    os.path.join(root_path, 'valid.csv'), sep='\t', header=None, index=None)

# Train with fastText's automatic hyper-parameter search.
# The autotune duration used here may be too short; the online score obtained
# this way was not very good.
model = fasttext.train_supervised(
    input=os.path.join(root_path, 'train.csv'),
    # validation file that autotune evaluates candidates on
    autotuneValidationFile=os.path.join(root_path, 'valid.csv'),
    # time budget for the search
    autotuneDuration=600,
)

# Prediction with this model works exactly as in the prediction step above.