import json
import re

import requests
from bs4 import BeautifulSoup

# Characters stripped from the overview text: newline, space, CR, tab, ASCII
# colon, no-break space (\xa0) and ideographic space (\u3000).  Compiled once
# at module level instead of rebuilding the pattern on every call.
_STRIP_CHARS = re.compile(r'[\n \r\t:\xa0\u3000]')


def zhiwang(id):
    """
    Scrape a CNKI wiki ("知网") hot-word page and return its data.

    :param id: page id of the hot word, e.g. ``'3718950'`` (any value
        convertible with ``str``).  The name shadows the ``id`` builtin but is
        kept for backward compatibility with existing callers.
    :return: dict that may contain the keys
        ``概述`` (overview text, whitespace stripped),
        ``相似词`` / ``相关词`` (lists of similar / related word titles),
        ``期刊url`` (list of journal-article URLs) and
        ``期刊word`` (per-article keyword lists).
        Keys are absent when the page lacks the corresponding section.
    :raises requests.RequestException: on network failure or timeout.
    """
    # NOTE(review): the Cookie below is a captured session value and will
    # eventually expire — confirm whether the endpoint works without it.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive", "Host": "wiki.cnki.com.cn",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": "UM_distinctid=173c16e519d73a-058630ec26edf5-3323765-144000-173c16e519e6d0; SID_wiki=018060; CNZZDATA3412177=cnzz_eid%3D2011079095-1596676096-null%26ntime%3D1597380140",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"}
    url = "http://wiki.cnki.com.cn/HotWord/" + str(id) + ".htm"
    # timeout so a stalled connection cannot hang the caller indefinitely
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = {}
    # Overview paragraph(s); if several <p> exist, the last one wins
    # (preserves the original behaviour).
    for element in soup.find_all(class_='explain'):
        for element_one in element.find_all('p'):
            content['概述'] = _STRIP_CHARS.sub('', element_one.get_text())
    # Sidebar panels: first panel is "similar words", second "related words".
    list_content = []
    list_content_key = ['相似词', '相关词']
    for element in soup.find_all(class_='sidepan2'):
        for element_one in element.find_all(class_='thesis'):
            word_value = []
            for element_three in element_one.find_all("ul"):
                for element_four in element_three.find_all('li'):
                    for element_five in element_four.find_all('a'):
                        word_value.append(element_five['title'])
            list_content.append(word_value)
    # zip truncates to the shorter sequence, so at most two panels are kept.
    content.update(dict(zip(list_content_key, list_content)))
    # Journal article list: collect each article's URL and its keyword links.
    qikan_list = []
    words = []
    for element in soup.find_all(class_='essay_list'):
        for element_one in element.find_all('dl'):
            for element_two in element_one.find_all('dt'):
                for element_three in element_two.find_all('strong'):
                    for element_four in element_three.find_all('a'):
                        # hrefs are protocol-relative ("//..."), prepend scheme
                        qikan_list.append('http:' + element_four['href'])
            temp_word = []
            for element_five in element_one.find_all(class_='infor'):
                for element_six in element_five.find_all('a'):
                    temp_word.append(element_six.get_text())
            words.append(temp_word)
    content['期刊url'] = qikan_list
    content['期刊word'] = words
    return content


if __name__ == '__main__':
    print(json.dumps(zhiwang('3718950'), ensure_ascii=False, indent=2))