镜像自地址
https://gitee.com/medical-alliance/Medical-nlp.git
已同步 2025-12-06 01:16:47 +00:00
增加doc2vec
这个提交包含在:
3
.idea/.gitignore
自动生成的
vendored
普通文件
3
.idea/.gitignore
自动生成的
vendored
普通文件
@@ -0,0 +1,3 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
8
.idea/Medical-nlp.iml
自动生成的
普通文件
8
.idea/Medical-nlp.iml
自动生成的
普通文件
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
6
.idea/inspectionProfiles/profiles_settings.xml
自动生成的
普通文件
6
.idea/inspectionProfiles/profiles_settings.xml
自动生成的
普通文件
@@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
||||||
4
.idea/misc.xml
自动生成的
普通文件
4
.idea/misc.xml
自动生成的
普通文件
@@ -0,0 +1,4 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
自动生成的
普通文件
8
.idea/modules.xml
自动生成的
普通文件
@@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/Medical-nlp.iml" filepath="$PROJECT_DIR$/.idea/Medical-nlp.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
6
.idea/vcs.xml
自动生成的
普通文件
6
.idea/vcs.xml
自动生成的
普通文件
@@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
gensim
|
gensim
|
||||||
flask
|
flask
|
||||||
flasgger
|
flasgger
|
||||||
|
jieba
|
||||||
60
src/doc2vec_data.py
普通文件
60
src/doc2vec_data.py
普通文件
@@ -0,0 +1,60 @@
|
|||||||
|
import gensim
|
||||||
|
import numpy as np
|
||||||
|
import jieba
|
||||||
|
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
|
||||||
|
|
||||||
|
|
||||||
|
def jieba_tokenize(text):
    """Segment *text* into a list of tokens using jieba.

    :param text: input string to segment
    :return: list of token strings
    """
    # jieba.lcut(text) is documented as exactly list(jieba.cut(text));
    # the explicit list() form makes the laziness boundary visible.
    return list(jieba.cut(text))
|
||||||
|
|
||||||
|
|
||||||
|
def get_datasest():
    """Build the doc2vec training corpus from 'toutiao_cat_data.txt'.

    Each line of the file is expected to use '_!_' as a field separator,
    with field 1 holding an integer document id and field 3 holding the
    document text. Lines with fewer than 4 fields are skipped.

    :return: list of TaggedDocument(words, tags=[id]) training examples
    """
    x_train = []
    # 'with' guarantees the file handle is closed; the original iterated
    # a bare open(...) and leaked it (and shadowed the builtin 'file').
    with open('toutiao_cat_data.txt', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('_!_')
            if len(fields) > 3:
                # Fix: the original referenced the undefined name
                # 'TaggededDocument' (NameError at runtime); the actual
                # import at the top of this module is TaggedDocument.
                # NOTE(review): fields[3] is a raw string, not a token
                # list, so downstream consumers of .words iterate single
                # characters — confirm this is intended.
                document = TaggedDocument(fields[3], tags=[int(fields[1])])
                x_train.append(document)
    return x_train
|
||||||
|
|
||||||
|
|
||||||
|
def train(x_train, size=2000, epoch_num=10):
    """Train a Doc2Vec model on the tagged corpus and persist it to 'model'.

    :param x_train: list of TaggedDocument training examples
    :param size: dimensionality of the document vectors
    :param epoch_num: number of training epochs
    :return: the trained Doc2Vec model (also saved to file 'model')
    """
    # NOTE(review): 'size=' and the .docvecs attribute used elsewhere in
    # this file are the gensim < 4.0 API (renamed to vector_size in 4.0)
    # — confirm the pinned gensim version before upgrading.
    model_dm = Doc2Vec(min_count=1, window=3, size=size, sample=1e-3,
                       negative=5, workers=4)
    # Fix: the original passed x_train to the constructor (which already
    # builds the vocab AND trains with default epochs) and then called
    # train() again — training the model twice. Build the vocabulary
    # explicitly and train exactly once with the requested epoch count.
    model_dm.build_vocab(x_train)
    model_dm.train(x_train, total_examples=model_dm.corpus_count,
                   epochs=epoch_num)
    model_dm.save('model')
    return model_dm
|
||||||
|
|
||||||
|
|
||||||
|
def getVecs(model, corpus, size):
    """Stack the stored document vectors for *corpus* into one array.

    :param model: trained Doc2Vec model; vectors are looked up via
        model.docvecs keyed by each document's first tag
    :param corpus: iterable of tagged documents (doc.tags[0] is the key)
    :param size: dimensionality of each document vector
    :return: numpy array of shape (len(corpus), size)
    """
    rows = []
    for doc in corpus:
        stored = model.docvecs[doc.tags[0]]
        rows.append(np.array(stored.reshape(1, size)))
    return np.concatenate(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """Query the persisted model with a fixed sample sentence.

    Loads the Doc2Vec model saved under 'model', infers a vector for a
    hard-coded pre-tokenized query, and returns its ten most similar
    training documents.

    :return: list of (tag, cosine-similarity) pairs, most similar first
    """
    model = Doc2Vec.load("model")
    query_tokens = ['想换个', '30', '万左右', '的', '车', ',', '现在', '开科鲁兹', ',', '有', '什么', '好', '推荐', '的', '?']
    query_vector = model.infer_vector(query_tokens)
    return model.docvecs.most_similar([query_vector], topn=10)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Build the corpus and train / persist the doc2vec model.
    x_train = get_datasest()
    model_dm = train(x_train)

    # Query the saved model and print the most similar training documents.
    sims = test()
    for count, sim in sims:
        # NOTE(review): 'count' here is a document *tag* (an id parsed
        # from the data file), not a list position — indexing x_train by
        # it assumes tags coincide with list indices; verify against how
        # get_datasest assigns tags, otherwise this can raise IndexError.
        sentence = x_train[count]
        # sentence[0] is the TaggedDocument's words field. Build the
        # display string with a single join instead of the original
        # quadratic '+='-style concatenation; output is byte-identical
        # (each element followed by one space, including the last).
        words = ''.join(word + ' ' for word in sentence[0])
        print(words, sim, len(sentence[0]))
|
||||||
在新工单中引用
屏蔽一个用户