镜像自地址
https://gitee.com/medical-alliance/Medical-nlp.git
已同步 2025-12-06 01:16:47 +00:00
增加医学辞典构建
这个提交包含在:
43
.idea/workspace.xml
自动生成的
43
.idea/workspace.xml
自动生成的
@@ -20,7 +20,8 @@
|
|||||||
</component>
|
</component>
|
||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="818087ba-af84-4167-ab4f-8ca76742a4b1" name="Default" comment="">
|
<list default="true" id="818087ba-af84-4167-ab4f-8ca76742a4b1" name="Default" comment="">
|
||||||
<change beforePath="$PROJECT_DIR$/.gitignore" beforeDir="false" afterPath="$PROJECT_DIR$/.gitignore" afterDir="false" />
|
<change afterPath="$PROJECT_DIR$/src/zhiwang_dict.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
</list>
|
</list>
|
||||||
<option name="SHOW_DIALOG" value="false" />
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
@@ -53,7 +54,7 @@
|
|||||||
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
||||||
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
|
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
|
||||||
</component>
|
</component>
|
||||||
<component name="RunManager" selected="Python.text_sim">
|
<component name="RunManager" selected="Python.zhiwang_dict">
|
||||||
<configuration name="baidu" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
<configuration name="baidu" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||||
<module name="Medical-nlp" />
|
<module name="Medical-nlp" />
|
||||||
<option name="INTERPRETER_OPTIONS" value="" />
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
@@ -96,6 +97,27 @@
|
|||||||
<option name="INPUT_FILE" value="" />
|
<option name="INPUT_FILE" value="" />
|
||||||
<method v="2" />
|
<method v="2" />
|
||||||
</configuration>
|
</configuration>
|
||||||
|
<configuration name="zhiwang_dict" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||||
|
<module name="Medical-nlp" />
|
||||||
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
|
<option name="PARENT_ENVS" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
</envs>
|
||||||
|
<option name="SDK_HOME" value="" />
|
||||||
|
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
|
||||||
|
<option name="IS_MODULE_SDK" value="true" />
|
||||||
|
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||||
|
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||||
|
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/zhiwang_dict.py" />
|
||||||
|
<option name="PARAMETERS" value="" />
|
||||||
|
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||||
|
<option name="EMULATE_TERMINAL" value="false" />
|
||||||
|
<option name="MODULE_MODE" value="false" />
|
||||||
|
<option name="REDIRECT_INPUT" value="false" />
|
||||||
|
<option name="INPUT_FILE" value="" />
|
||||||
|
<method v="2" />
|
||||||
|
</configuration>
|
||||||
<configuration default="true" type="tests" factoryName="Nosetests">
|
<configuration default="true" type="tests" factoryName="Nosetests">
|
||||||
<module name="Medical-nlp" />
|
<module name="Medical-nlp" />
|
||||||
<option name="INTERPRETER_OPTIONS" value="" />
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
@@ -113,6 +135,7 @@
|
|||||||
</configuration>
|
</configuration>
|
||||||
<recent_temporary>
|
<recent_temporary>
|
||||||
<list>
|
<list>
|
||||||
|
<item itemvalue="Python.zhiwang_dict" />
|
||||||
<item itemvalue="Python.text_sim" />
|
<item itemvalue="Python.text_sim" />
|
||||||
<item itemvalue="Python.baidu" />
|
<item itemvalue="Python.baidu" />
|
||||||
</list>
|
</list>
|
||||||
@@ -141,22 +164,22 @@
|
|||||||
</option>
|
</option>
|
||||||
</component>
|
</component>
|
||||||
<component name="WindowStateProjectService">
|
<component name="WindowStateProjectService">
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.bottom" timestamp="1602081675604">
|
<state width="1515" height="210" key="GridCell.Tab.0.bottom" timestamp="1602081943103">
|
||||||
<screen x="0" y="0" width="1536" height="824" />
|
<screen x="0" y="0" width="1536" height="824" />
|
||||||
</state>
|
</state>
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.bottom/0.0.1536.824@0.0.1536.824" timestamp="1602081675604" />
|
<state width="1515" height="210" key="GridCell.Tab.0.bottom/0.0.1536.824@0.0.1536.824" timestamp="1602081943103" />
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.center" timestamp="1602081675604">
|
<state width="1515" height="210" key="GridCell.Tab.0.center" timestamp="1602081943102">
|
||||||
<screen x="0" y="0" width="1536" height="824" />
|
<screen x="0" y="0" width="1536" height="824" />
|
||||||
</state>
|
</state>
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.center/0.0.1536.824@0.0.1536.824" timestamp="1602081675604" />
|
<state width="1515" height="210" key="GridCell.Tab.0.center/0.0.1536.824@0.0.1536.824" timestamp="1602081943102" />
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.left" timestamp="1602081675603">
|
<state width="1515" height="210" key="GridCell.Tab.0.left" timestamp="1602081943102">
|
||||||
<screen x="0" y="0" width="1536" height="824" />
|
<screen x="0" y="0" width="1536" height="824" />
|
||||||
</state>
|
</state>
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.left/0.0.1536.824@0.0.1536.824" timestamp="1602081675603" />
|
<state width="1515" height="210" key="GridCell.Tab.0.left/0.0.1536.824@0.0.1536.824" timestamp="1602081943102" />
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.right" timestamp="1602081675604">
|
<state width="1515" height="210" key="GridCell.Tab.0.right" timestamp="1602081943102">
|
||||||
<screen x="0" y="0" width="1536" height="824" />
|
<screen x="0" y="0" width="1536" height="824" />
|
||||||
</state>
|
</state>
|
||||||
<state width="1515" height="210" key="GridCell.Tab.0.right/0.0.1536.824@0.0.1536.824" timestamp="1602081675604" />
|
<state width="1515" height="210" key="GridCell.Tab.0.right/0.0.1536.824@0.0.1536.824" timestamp="1602081943102" />
|
||||||
<state x="431" y="145" width="672" height="678" key="search.everywhere.popup" timestamp="1599057981367">
|
<state x="431" y="145" width="672" height="678" key="search.everywhere.popup" timestamp="1599057981367">
|
||||||
<screen x="0" y="0" width="1536" height="824" />
|
<screen x="0" y="0" width="1536" height="824" />
|
||||||
</state>
|
</state>
|
||||||
|
|||||||
59
src/zhiwang_dict.py
普通文件
59
src/zhiwang_dict.py
普通文件
@@ -0,0 +1,59 @@
|
|||||||
|
import json
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
def zhiwang(id, timeout=10):
    """
    Look up a term page on the CNKI wiki (wiki.cnki.com.cn) and scrape it.

    :param id: page id; it is converted with ``str()`` and inserted into the
        ``/HotWord/<id>.htm`` URL.
    :param timeout: seconds to wait for the HTTP request before
        ``requests`` raises a timeout error (default 10).
    :return: dict that always contains ``'期刊url'`` (list of link strings)
        and ``'期刊word'`` (list of lists of link texts), and, when the
        matching page sections are found, ``'概述'`` (summary text) plus
        ``'相似词'`` / ``'相关词'`` (lists of anchor titles).
    """
    # Browser-like headers copied from a real session.
    # NOTE(review): the Cookie is a hard-coded session value and has
    # presumably expired -- confirm whether the site still needs it.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive", "Host": "wiki.cnki.com.cn",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": "UM_distinctid=173c16e519d73a-058630ec26edf5-3323765-144000-173c16e519e6d0; SID_wiki=018060; CNZZDATA3412177=cnzz_eid%3D2011079095-1596676096-null%26ntime%3D1597380140",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
    url = "http://wiki.cnki.com.cn/HotWord/{}.htm".format(id)
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = {}

    # Summary: text of the <p> elements under class "explain" with the
    # listed whitespace/colon characters removed. Each <p> overwrites the
    # previous one, so the last paragraph found is the one kept.
    for explain in soup.find_all(class_='explain'):
        for paragraph in explain.find_all('p'):
            content['概述'] = re.sub(('[\n \r \t :\xa0\u3000]'), '', paragraph.get_text())

    # Side panel: one word list per "thesis" section, paired in document
    # order with the keys below; sections beyond the second are dropped by
    # zip().
    list_content_key = ['相似词', '相关词']
    list_content = []
    for side_panel in soup.find_all(class_='sidepan2'):
        for thesis in side_panel.find_all(class_='thesis'):
            word_value = []
            for ul in thesis.find_all("ul"):
                for li in ul.find_all('li'):
                    # title=True skips anchors without a title attribute
                    # instead of raising KeyError.
                    for anchor in li.find_all('a', title=True):
                        word_value.append(anchor['title'])
            list_content.append(word_value)
    content.update(dict(zip(list_content_key, list_content)))

    # Essay list: for every <dl>, collect the links under dt > strong > a
    # and the texts of the anchors inside the "infor" element(s).
    qikan_list = []
    words = []
    for essay_list in soup.find_all(class_='essay_list'):
        for entry in essay_list.find_all('dl'):
            for dt in entry.find_all('dt'):
                for strong in dt.find_all('strong'):
                    # href=True skips anchors without an href attribute
                    # instead of raising KeyError.
                    for anchor in strong.find_all('a', href=True):
                        # presumably the hrefs are protocol-relative
                        # ("//host/..."), hence the "http:" prefix -- verify.
                        qikan_list.append('http:' + anchor['href'])
            temp_word = []
            for infor in entry.find_all(class_='infor'):
                for anchor in infor.find_all('a'):
                    temp_word.append(anchor.get_text())
            words.append(temp_word)
    content['期刊url'] = qikan_list
    content['期刊word'] = words
    return content
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Demo run: fetch one sample page and pretty-print the scraped result,
    # keeping non-ASCII characters readable in the output.
    result = zhiwang('3718950')
    print(json.dumps(result, ensure_ascii=False, indent=2))
|
||||||
在新工单中引用
屏蔽一个用户