增加hanlp

2025-12-06 17:36:47 +00:00 · 2020-09-01 14:40:46 +08:00
--- a/src/get_words.py
+++ b/src/get_words.py
@@ -0,0 +1,119 @@
+from pyhanlp import *
+import gc
+
+
+def group_consecutives(vals, step=1):
+    """Split ``vals`` into runs whose neighbours differ by exactly ``step``.
+
+    At least one run is always returned, so empty input yields ``[[]]``.
+    """
+    groups = [[]]
+    expected = None
+    for value in vals:
+        if expected is not None and value != expected:
+            groups.append([])
+        groups[-1].append(value)
+        expected = value + step
+    return groups
+
+
+def find_word(text, words, del_status):
+    """Extract candidate compound terms from ``text`` via HanLP dependency parsing.
+
+    Every entry of ``words`` is first added to HanLP's CustomDictionary. Each
+    token tagged "定中关系" forms a (token, head) pair; a token containing the
+    head of a "右附加关系" token is widened to that head plus the attached token
+    when the widened pair still occurs in ``text``. Pairs occurring in ``text``
+    at consecutive token positions merge into one term: tokens, then last head.
+
+    Words added to the CustomDictionary are not removed by this function.
+
+    Args:
+        text: Sentence to analyse.
+        words: Iterable of words to register in the custom dictionary.
+        del_status: When 0, drop the parser objects and run the garbage
+            collector before returning.
+
+    Returns:
+        ``(result, word_cut)``: the extracted terms, and the LEMMA of every
+        token in parse order.
+    """
+    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
+    for element in words:
+        CustomDictionary.add(element)
+
+    sentence = HanLP.parseDependency(text)
+    word_array = sentence.getWordArray()
+    word_cut = []
+    pairs = {}  # token index -> [token, head], kept in token order
+    suffixed = []  # (head, head + right-attached token)
+    for i, word in enumerate(word_array):
+        word_cut.append(word.LEMMA)
+        if word.DEPREL == "定中关系":
+            pairs[i] = [word.LEMMA, word.HEAD.LEMMA]
+        elif word.DEPREL == "右附加关系":
+            suffixed.append((word.HEAD.LEMMA, word.HEAD.LEMMA + word.LEMMA))
+
+    for pair in pairs.values():
+        for base, extended in suffixed:
+            # Widen the token only if the widened pair occurs in the text.
+            if base in pair[0] and extended + pair[1] in text:
+                pair[0] = extended
+
+    # Keep only pairs whose token + head concatenation occurs in the text.
+    location = [i for i, (mod, head) in pairs.items() if mod + head in text]
+
+    result = []
+    for children in group_consecutives(location):
+        # group_consecutives yields one empty run when nothing matched.
+        if not children:
+            continue
+        ans_data = ""
+        for child in children:
+            ans_data = ans_data + pairs[child][0]
+        ans_data = ans_data + pairs[children[-1]][1]
+        if ans_data:
+            result.append(ans_data)
+
+    if del_status == 0:
+        # Deleting entries of locals() never unbinds real locals, so release
+        # the Java proxies explicitly and collect once.
+        del sentence, word_array
+        gc.collect()
+    return result, word_cut
+
+
+def text_get(text, words):
+    """Run find_word on ``text`` up to 7 times, feeding found terms back in.
+
+    New entries of ``words`` are appended to add_words.txt; ``words`` is extended
+    in place and finally removed from HanLP's CustomDictionary. Returns
+    ``(new_word, seg_data)``: ``[terms, pass_index]`` and tokens for each pass.
+    """
+    f = "add_words.txt"
+    try:
+        with open(f, encoding="utf8") as f_data:
+            known = set(f_data.readlines())
+    except FileNotFoundError:
+        known = set()  # first run: the append below creates the file
+    with open(f, "a+", encoding="utf8") as file:  # match the read encoding
+        for i in words:
+            temp = str(i) + "\n"
+            if temp not in known:
+                known.add(temp)  # also skips duplicates within ``words``
+                file.write(temp)
+    seg_data, new_word = [], []
+    for count in range(7):
+        result, word_cut = find_word(text, words, 1)
+        seg_data.append(word_cut)
+        words.extend(result)
+        new_word.append([result, count])
+        if not result:
+            break
+    gc.collect()  # one pass; deleting locals() entries never freed anything
+    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")
+    for element in words:
+        CustomDictionary.remove(element)
+    return new_word, seg_data
+
+# print(text_get("症状80%以上的患者在短时间内突然发生呼吸困难、烦躁不安、多汗、心悸、胸痛。",["抗凝药物"]))
--- a/src/tenxun.py
+++ b/src/tenxun.py
@@ -29,4 +29,4 @@ def to_kg():

 if __name__ == '__main__':
    app.run('0.0.0.0', port=8020)
-~
+