From 3e447090cd36f9795c7027da7183d3735eb77b8c Mon Sep 17 00:00:00 2001
From: lidunwei <13564180096@163.com>
Date: Wed, 7 Oct 2020 22:52:10 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=8C=BB=E5=AD=A6=E8=BE=9E?=
=?UTF-8?q?=E5=85=B8=E6=9E=84=E5=BB=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.idea/workspace.xml | 43 +++++++++++++++++++++++++--------
src/zhiwang_dict.py | 59 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 92 insertions(+), 10 deletions(-)
create mode 100644 src/zhiwang_dict.py
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 231f084..7c69f9c 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -20,7 +20,8 @@
-
+
+
@@ -53,7 +54,7 @@
-
+
@@ -96,6 +97,27 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -113,6 +135,7 @@
+
@@ -141,22 +164,22 @@
-
+
-
-
+
+
-
-
+
+
-
-
+
+
-
+
diff --git a/src/zhiwang_dict.py b/src/zhiwang_dict.py
new file mode 100644
index 0000000..13d7e88
--- /dev/null
+++ b/src/zhiwang_dict.py
@@ -0,0 +1,59 @@
+import json
+import re
+import requests
+from bs4 import BeautifulSoup
+
+
def zhiwang(id):
    """
    Scrape the CNKI wiki ("zhiwang") hot-word page for the given entry id.

    :param id: page id on wiki.cnki.com.cn (int or str). The name shadows the
        builtin ``id()`` but is kept as-is for caller compatibility.
    :return: dict with keys '概述' (overview text), '相似词'/'相关词'
        (similar/related word lists, when the sidebar provides them),
        '期刊url' (journal article links) and '期刊word' (per-article
        keyword lists).
    :raises requests.HTTPError: if the page request returns an error status.
    :raises requests.Timeout: if the server does not respond in time.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive", "Host": "wiki.cnki.com.cn",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": "UM_distinctid=173c16e519d73a-058630ec26edf5-3323765-144000-173c16e519e6d0; SID_wiki=018060; CNZZDATA3412177=cnzz_eid%3D2011079095-1596676096-null%26ntime%3D1597380140",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
    url = "http://wiki.cnki.com.cn/HotWord/" + str(id) + ".htm"
    # requests has no default timeout: without one a stalled server would
    # hang this call forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail fast instead of scraping an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    content = {}
    # Overview paragraph(s): the last <p> under class="explain" wins.
    # The character class strips newlines, tabs, NBSP/fullwidth spaces and
    # the fullwidth label colon in a single pass.
    for element in soup.find_all(class_='explain'):
        for paragraph in element.find_all('p'):
            content['概述'] = re.sub('[\n \r \t :\xa0\u3000]', '', paragraph.get_text())
    # Sidebar "thesis" boxes map positionally onto similar/related words;
    # zip() silently drops any boxes beyond the two expected keys, matching
    # the page layout observed at the time of writing.
    word_list_keys = ['相似词', '相关词']
    word_lists = []
    for element in soup.find_all(class_='sidepan2'):
        for box in element.find_all(class_='thesis'):
            box_words = [
                link['title']
                for ul in box.find_all('ul')
                for li in ul.find_all('li')
                for link in li.find_all('a')
            ]
            word_lists.append(box_words)
    content.update(dict(zip(word_list_keys, word_lists)))
    # Journal section: collect one url per article title link, and per <dl>
    # the keyword anchors shown in its "infor" row.  hrefs on the page are
    # protocol-relative, hence the 'http:' prefix.
    article_urls = []
    article_keywords = []
    for element in soup.find_all(class_='essay_list'):
        for entry in element.find_all('dl'):
            for title in entry.find_all('dt'):
                for strong in title.find_all('strong'):
                    for link in strong.find_all('a'):
                        article_urls.append('http:' + link['href'])
            keywords = []
            for infor in entry.find_all(class_='infor'):
                for link in infor.find_all('a'):
                    keywords.append(link.get_text())
            article_keywords.append(keywords)
    content['期刊url'] = article_urls
    content['期刊word'] = article_keywords
    return content
+
+
if __name__ == '__main__':
    # Demo run: scrape one known entry and pretty-print the result as JSON,
    # keeping the Chinese keys readable (ensure_ascii=False).
    entry = zhiwang('3718950')
    print(json.dumps(entry, ensure_ascii=False, indent=2))