镜像自地址
https://gitee.com/medical-alliance/Medical-nlp.git
已同步 2025-12-06 17:36:47 +00:00
增加hanlp
这个提交包含在:
119
src/get_words.py
普通文件
119
src/get_words.py
普通文件
@@ -0,0 +1,119 @@
|
||||
from pyhanlp import *
|
||||
import gc
|
||||
|
||||
|
||||
def group_consecutives(vals, step=1):
    """Split *vals* (a list of numbers) into runs of consecutive values.

    Two values are "consecutive" when the second equals the first plus
    *step*. Note: an empty input yields ``[[]]`` (one empty run), matching
    the classic recipe this follows.
    """
    groups = [[]]
    previous = None
    for value in vals:
        if previous is None or value == previous + step:
            # Still inside the current run.
            groups[-1].append(value)
        else:
            # Gap found: start a fresh run.
            groups.append([value])
        previous = value
    return groups
|
||||
|
||||
|
||||
def find_word(text, words, del_status):
    """Extract candidate compound terms from *text* using HanLP dependency parsing.

    Parameters
    ----------
    text : str
        Sentence to analyse.
    words : list
        Custom words registered with HanLP's CustomDictionary before parsing,
        so the segmenter keeps them intact.
    del_status : int
        When 0, drop large intermediates and run gc.collect() before
        returning (a memory-pressure workaround; presumably for long-running
        services — TODO confirm).

    Returns
    -------
    tuple
        (list of extracted compound terms, full token list of *text*).
    """
    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    for element in words:
        CustomDictionary.add(element)

    sentence = HanLP.parseDependency(text)
    word_array = sentence.getWordArray()

    data = []      # [token index, modifier, relation, head] for attributive relations
    status = []    # [head, head + suffix] pairs from right-adjunct relations
    word_cut = []  # plain token list of the sentence
    location = []  # token indices participating in a confirmed compound

    for i, word in enumerate(word_array):
        word_cut.append(word.LEMMA)
        # "定中关系" = attributive (modifier-head) dependency relation.
        if word.DEPREL == "定中关系":
            data.append([i, word.LEMMA, word.DEPREL, word.HEAD.LEMMA])
        # "右附加关系" = right-adjunct relation (e.g. a suffix on a head word).
        elif word.DEPREL == "右附加关系":
            status.append([word.HEAD.LEMMA, word.HEAD.LEMMA + word.LEMMA])

    # Fold a right-adjunct suffix into the modifier when the combined
    # modifier + head string actually occurs in the original text.
    if status:
        for element in data:
            for element_one in status:
                if element_one[0] in element[1] and element_one[1] + element[3] in text:
                    element[1] = element_one[1]

    # Keep only modifier+head pairs that appear verbatim in the text.
    for element in data:
        if element[1] + element[3] in text:
            location.append(element[0])

    location_split = group_consecutives(location)
    result = []
    # BUG FIX: the original guarded this loop with "location_split is not []",
    # an identity comparison against a fresh list that is always True; the
    # guard is dropped (iterating an empty list is already a no-op).
    for children in location_split:
        ans_data = ""
        if len(children) > 1:
            # A run of consecutive modifiers: concatenate every modifier,
            # then append the head of the last one.
            for child in children:
                for element in data:
                    if element[0] == child:
                        ans_data = ans_data + element[1]
            for element in data:
                if element[0] == children[len(children) - 1]:
                    ans_data = ans_data + element[3]
        # BUG FIX: original wrote "data is not []" (always True); use truthiness.
        elif data and children:
            for element in data:
                if element[0] == children[0]:
                    ans_data = element[1] + element[3]
        if ans_data:
            result.append(ans_data)

    if del_status == 0:
        # Best-effort memory release. The original also iterated
        # locals().keys() deleting entries, which never frees function
        # locals (and risks RuntimeError for mutating the dict while
        # iterating); that futile loop was removed.
        del words
        del sentence
        del word_array
        gc.collect()
    return result, word_cut
|
||||
|
||||
|
||||
def text_get(text, words):
    """Iteratively mine compound words from *text*.

    Seed words are persisted (deduplicated) to ``add_words.txt``, then
    find_word() runs repeatedly — feeding each round's discoveries back in
    as custom-dictionary entries — until a round finds nothing new or 7
    rounds have run.

    Parameters
    ----------
    text : str
        Sentence to analyse.
    words : list
        Seed words. NOTE: mutated in place — discovered terms are appended,
        and every entry is removed from HanLP's CustomDictionary at the end.

    Returns
    -------
    tuple
        ([[terms_found, round_index], ...], [token list per round, ...]).
    """
    path = "add_words.txt"
    with open(path, encoding="utf8") as f_data:
        all_text = f_data.readlines()
    # BUG FIX: write with the same utf8 encoding used for reading; the
    # original opened in the platform-default encoding, which corrupts (or
    # fails on) non-ASCII words e.g. on Windows. Redundant .close() calls
    # inside the with-blocks were also removed.
    with open(path, "a+", encoding="utf8") as file:
        for i in words:
            temp = str(i) + "\n"
            if temp not in all_text:  # append only words not already on file
                file.write(temp)

    seg_data = []
    new_word = []
    count = 0
    while True:
        result, word_cut = find_word(text, words, 1)
        seg_data.append(word_cut)
        words.extend(result)  # feed this round's discoveries into the next
        new_word.append([result, count])
        count += 1
        if not result or count > 6:  # converged, or hard cap of 7 rounds
            break

    # The original deleted every name via locals(), which is a no-op for
    # function locals; a plain gc.collect() preserves the intent.
    gc.collect()

    # Undo the custom-dictionary additions made during mining.
    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
    for element in words:
        CustomDictionary.remove(element)
    return new_word, seg_data
|
||||
|
||||
# print(text_get("症状80%以上的患者在短时间内突然发生呼吸困难、烦躁不安、多汗、心悸、胸痛。",["抗凝药物"]))
|
||||
@@ -29,4 +29,4 @@ def to_kg():
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run('0.0.0.0', port=8020)
|
||||
~
|
||||
|
||||
|
||||
在新工单中引用
屏蔽一个用户