增加hanlp

这个提交包含在:
13564180096
2020-09-01 14:40:46 +08:00
父节点 185c840ff3
当前提交 6bde4ea8b6
共有 12 个文件被更改,包括 426 次插入和 767 次删除

119
src/get_words.py 普通文件
查看文件

@@ -0,0 +1,119 @@
from pyhanlp import *
import gc
def group_consecutives(vals, step=1):
    """Split ``vals`` into runs in which neighbours differ by exactly ``step``.

    Args:
        vals: Iterable of numbers, walked in the order given.
        step: Required difference between adjacent members of one run.

    Returns:
        A list of runs (lists). An empty ``vals`` yields ``[[]]`` because the
        first run is created before any value is seen.
    """
    groups = [[]]
    upcoming = None  # value that would extend the current run
    for value in vals:
        if upcoming is None or value == upcoming:
            groups[-1].append(value)
        else:
            # Gap detected: open a new run starting at this value.
            groups.append([value])
        upcoming = value + step
    return groups
def _collect_relations(text):
    """Parse ``text`` with HanLP and return its lemmas plus the relations of interest.

    Returns:
        ``(lemmas, attributive, suffixed)`` where ``lemmas`` lists every
        token's lemma in order, ``attributive`` holds
        ``[index, lemma, relation, head_lemma]`` for tokens whose relation
        label is "定中关系" (attributive modifier), and ``suffixed`` holds
        ``[head_lemma, head_lemma + lemma]`` for tokens labelled
        "右附加关系" (right adjunct).
    """
    lemmas = []
    attributive = []
    suffixed = []
    for index, word in enumerate(HanLP.parseDependency(text).getWordArray()):
        lemmas.append(word.LEMMA)
        if word.DEPREL == "定中关系":
            attributive.append([index, word.LEMMA, word.DEPREL, word.HEAD.LEMMA])
        elif word.DEPREL == "右附加关系":
            suffixed.append([word.HEAD.LEMMA, word.HEAD.LEMMA + word.LEMMA])
    return lemmas, attributive, suffixed


def _merge_phrases(text, attributive, suffixed):
    """Join attributive tokens with their heads into phrases that occur in ``text``."""
    # Widen a modifier to "head + suffix" when that longer form, followed by
    # the modifier's own head, is literally present in the text.
    for entry in attributive:
        for head, head_with_suffix in suffixed:
            if head in entry[1] and head_with_suffix + entry[3] in text:
                entry[1] = head_with_suffix

    # Keep only modifier/head pairs that are adjacent in the text, keyed by
    # token index so each run can be assembled without rescanning the list.
    by_index = {
        entry[0]: entry for entry in attributive if entry[1] + entry[3] in text
    }

    phrases = []
    for run in group_consecutives(list(by_index)):
        if not run:
            # group_consecutives returns [[]] when nothing was located.
            continue
        # A run of consecutive modifiers is chained, then closed with the
        # head of the last modifier; a single modifier is the 1-element case.
        phrase = "".join(by_index[i][1] for i in run) + by_index[run[-1]][3]
        if phrase:
            phrases.append(phrase)
    return phrases


def find_word(text, words, del_status):
    """Extract candidate compound terms from ``text`` using dependency parsing.

    Every entry of ``words`` is first added to HanLP's ``CustomDictionary``
    (the entries are not removed here). The text is then parsed and
    consecutive attributive modifiers are merged with their head word into
    phrases that literally appear in ``text``.

    Args:
        text: Sentence to analyse.
        words: Terms to add to the custom dictionary before parsing.
        del_status: When ``0``, a garbage-collection pass is forced before
            returning.

    Returns:
        ``(result, word_cut)``: the merged phrases, and the lemma of every
        token of the parse in order.
    """
    custom_dictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    for term in words:
        custom_dictionary.add(term)

    # The parse objects live only inside the helper, so nothing in this
    # frame keeps them alive when the optional collection below runs.
    word_cut, attributive, suffixed = _collect_relations(text)
    result = _merge_phrases(text, attributive, suffixed)

    if del_status == 0:
        gc.collect()
    return result, word_cut
def _append_new_words(path, words):
    """Append entries of ``words`` not yet listed in ``path``, one per line."""
    known = set()
    needs_newline = False
    try:
        with open(path, encoding="utf8") as handle:
            for line in handle:
                known.add(line.rstrip("\n"))
                needs_newline = not line.endswith("\n")
    except FileNotFoundError:
        pass  # first run: the append below creates the file

    fresh = []
    for word in words:
        entry = str(word)
        if entry not in known:
            known.add(entry)  # also guards against repeats within ``words``
            fresh.append(entry + "\n")
    if not fresh:
        return

    # Write with the same encoding used for reading so the file stays
    # decodable regardless of the platform's default encoding.
    with open(path, "a", encoding="utf8") as handle:
        if needs_newline:
            # Do not glue the first new entry onto an unterminated last line.
            handle.write("\n")
        handle.writelines(fresh)


def text_get(text, words):
    """Iteratively discover compound terms in ``text``.

    The given ``words`` are first recorded in ``add_words.txt`` (entries
    already present are skipped). ``find_word`` is then run repeatedly; the
    phrases found in one round are appended to ``words`` so the next round
    segments with them. The loop stops when a round finds nothing or after
    seven rounds. Finally every entry of ``words`` is removed from HanLP's
    ``CustomDictionary``, also when a round raises.

    Args:
        text: Sentence to analyse.
        words: List of seed terms. It is extended in place with every
            phrase discovered.

    Returns:
        ``(new_word, seg_data)``: ``new_word`` is a list of
        ``[phrases, round_number]`` pairs and ``seg_data`` is the list of
        token lemmas produced in each round.
    """
    _append_new_words("add_words.txt", words)

    custom_dictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    seg_data = []
    new_word = []
    try:
        for round_number in range(7):
            result, word_cut = find_word(text, words, 1)
            seg_data.append(word_cut)
            words.extend(result)
            new_word.append([result, round_number])
            if not result:
                break
    finally:
        gc.collect()
        # find_word registers the terms in the shared dictionary; undo that
        # here so they do not leak into later calls.
        for term in words:
            custom_dictionary.remove(term)
    return new_word, seg_data
# print(text_get("症状80%以上的患者在短时间内突然发生呼吸困难、烦躁不安、多汗、心悸、胸痛。",["抗凝药物"]))

查看文件

@@ -29,4 +29,4 @@ def to_kg():
if __name__ == '__main__':
app.run('0.0.0.0', port=8020)
~