diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/Medical-nlp.iml b/.idea/Medical-nlp.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/Medical-nlp.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..a2e120d
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..88b6074
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9d6b909..0f687c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 gensim
 flask
-flasgger
\ No newline at end of file
+flasgger
+jieba
\ No newline at end of file
diff --git a/src/doc2vec_data.py b/src/doc2vec_data.py
new file mode 100644
index 0000000..4c1ec4b
--- /dev/null
+++ b/src/doc2vec_data.py
@@ -0,0 +1,60 @@
+import gensim
+import numpy as np
+import jieba
+from gensim.models.doc2vec import Doc2Vec,TaggedDocument
+
+
+def jieba_tokenize(text):
+    """
+    Tokenize text with jieba.
+    :param text: text to segment
+    :return: list of tokens
+    """
+    return jieba.lcut(text)
+
+
+def get_datasest():
+    """
+    Load the doc2vec training corpus from toutiao_cat_data.txt.
+    :return: list of TaggedDocument (text field tagged with its category id)
+    """
+    x_train = []
+    for line in open('toutiao_cat_data.txt', encoding='utf8'):
+        fields = line.split('_!_')
+        if len(fields) > 3:
+            document = TaggedDocument(fields[3], tags=[int(fields[1])])
+            x_train.append(document)
+    return x_train
+
+
+def train(x_train, size=2000, epoch_num=10):
+    model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)  # gensim<4 API ('size'); 4.x renamed it 'vector_size'
+    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
+    model_dm.save('model')
+    return model_dm
+
+
+def getVecs(model, corpus, size):
+    vecs = [np.array(model.docvecs[z.tags[0]].reshape(1, size)) for z in corpus]
+    return np.concatenate(vecs)
+
+
+def test():
+    model_dm = Doc2Vec.load("model")
+    test_text = ['想换个', '30', '万左右', '的', '车', ',', '现在', '开科鲁兹', ',', '有', '什么', '好', '推荐', '的', '?']
+    inferred_vector_dm = model_dm.infer_vector(test_text)
+    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)
+    return sims
+
+
+if __name__ == '__main__':
+    x_train = get_datasest()
+    model_dm = train(x_train)
+
+    sims = test()
+    for count, sim in sims:
+        sentence = x_train[count]  # NOTE(review): 'count' is a doc tag (category id), used as a list index — verify
+        words = ''
+        for word in sentence[0]:
+            words = words + word + ' '
+        print(words, sim, len(sentence[0]))