From babb0584b4036811d3526256c8bc44dfdf52fba1 Mon Sep 17 00:00:00 2001
From: lidunwei <13564180096>
Date: Tue, 25 Aug 2020 21:39:14 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0doc2vec?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/.gitignore | 3 +
.idea/Medical-nlp.iml | 8 +++
.../inspectionProfiles/profiles_settings.xml | 6 ++
.idea/misc.xml | 4 ++
.idea/modules.xml | 8 +++
.idea/vcs.xml | 6 ++
requirements.txt | 3 +-
src/doc2vec_data.py | 60 +++++++++++++++++++
8 files changed, 97 insertions(+), 1 deletion(-)
create mode 100644 .idea/.gitignore
create mode 100644 .idea/Medical-nlp.iml
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/vcs.xml
create mode 100644 src/doc2vec_data.py
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/Medical-nlp.iml b/.idea/Medical-nlp.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/Medical-nlp.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..a2e120d
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..88b6074
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9d6b909..0f687c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
gensim
flask
-flasgger
\ No newline at end of file
+flasgger
+jieba
\ No newline at end of file
diff --git a/src/doc2vec_data.py b/src/doc2vec_data.py
new file mode 100644
index 0000000..4c1ec4b
--- /dev/null
+++ b/src/doc2vec_data.py
@@ -0,0 +1,60 @@
+import gensim
+import numpy as np
+import jieba
+from gensim.models.doc2vec import Doc2Vec,TaggedDocument
+
+
+def jieba_tokenize(text):
+ """
+ 文本分词
+ :param text: 文本
+ :return: 分词list
+ """
+ return jieba.lcut(text)
+
+
+def get_datasest():
+ """
+ 获取doc2vec文本训练数据集
+ :return: 文本分词list,及id
+ """
+ x_train = []
+ for file in open('toutiao_cat_data.txt', encoding='utf8'):
+ file = file.split('_!_')
+ if len(file) > 3:
+ document = TaggededDocument(file[3], tags=[int(file[1])])
+ x_train.append(document)
+ return x_train
+
+
+def train(x_train, size=2000, epoch_num=10):
+ model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
+ model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
+ model_dm.save('model')
+ return model_dm
+
+
+def getVecs(model, corpus, size):
+ vecs = [np.array(model.docvecs[z.tags[0]].reshape(1, size)) for z in corpus]
+ return np.concatenate(vecs)
+
+
+def test():
+ model_dm = Doc2Vec.load("model")
+ test_text = ['想换个', '30', '万左右', '的', '车', ',', '现在', '开科鲁兹', ',', '有', '什么', '好', '推荐', '的', '?']
+ inferred_vector_dm = model_dm.infer_vector(test_text)
+ sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)
+ return sims
+
+
+if __name__ == '__main__':
+ x_train = get_datasest()
+ model_dm = train(x_train)
+
+ sims = test()
+ for count, sim in sims:
+ sentence = x_train[count]
+ words = ''
+ for word in sentence[0]:
+ words = words + word + ' '
+ print(words, sim, len(sentence[0]))