#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Segment a raw corpus with jieba and train word2vec embeddings with gensim.

Importing this module reads ``corpus_1.txt``, segments every line with jieba
and writes the space-separated tokens to ``corpusSegDone_1.txt``.  Running it
as a script additionally trains a gensim ``Word2Vec`` model on the segmented
file and saves both the model and its vectors in word2vec text format.

Provenance: recovered from the "add word2vec training" patch (commit
c2418159593e6208d3849cf2dfb49abc6a6638ea, 2020-10-07), which creates
``src/medical_word2vec.py``.
"""

import logging
import multiprocessing
import os.path
import sys
import warnings

import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Raw input corpus and the file that receives the segmented text.
filePath = 'corpus_1.txt'
fileSegWordDonePath = 'corpusSegDone_1.txt'
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')


def PrintListChinese(list):
    """Print every element of *list* on its own line.

    The parameter name shadows the builtin ``list``; it is kept unchanged so
    that existing keyword callers keep working.
    """
    for item in list:
        print(item)


# Read the raw corpus, one entry per line.  The encoding is given explicitly
# so that decoding does not depend on the platform default; the segmented
# output below is written as UTF-8 as well.
with open(filePath, 'r', encoding='utf-8') as fileTrainRaw:
    fileTrainRead = fileTrainRaw.readlines()

# Segment every line with jieba.  Each entry of fileTrainSeg is a one-element
# list holding the space-joined tokens of the corresponding corpus line.
fileTrainSeg = []
for i, raw_line in enumerate(fileTrainRead):
    # NOTE(review): [9:-11] drops the first 9 and the last 11 characters of
    # each line -- presumably a fixed wrapper such as "<content>" ...
    # "</content>\n"; confirm against the actual corpus format.
    tokens = jieba.cut(raw_line[9:-11], cut_all=False)
    fileTrainSeg.append([' '.join(tokens)])
    if i % 100 == 0:
        print(i)  # progress indicator

# Save the segmentation result, one segmented corpus line per output line.
with open(fileSegWordDonePath, 'w', encoding='utf-8') as fW:
    fW.writelines(segmented[0] + '\n' for segmented in fileTrainSeg)


# Train word vectors with gensim word2vec.
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])  # file name of the running script
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                        level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # inp is the segmented corpus, out_model the saved gensim model and
    # out_vector the vectors exported in word2vec text format.
    inp = fileSegWordDonePath
    out_model = 'corpusSegDone_1.model'
    out_vector = 'corpusSegDone_1.vector'

    # No ``sg`` argument is passed, so gensim's default training algorithm
    # (CBOW, sg=0) is used -- not skip-gram.
    # NOTE: gensim >= 4.0 renamed ``size`` to ``vector_size``; rename the
    # keyword when upgrading gensim.
    model = Word2Vec(LineSentence(inp), size=50, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    # Save the full model (can be reloaded to continue training).
    model.save(out_model)
    # Save only the word vectors, as plain text.
    model.wv.save_word2vec_format(out_vector, binary=False)