from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
import trafilatura


@dataclass
class WebExtractorConfig:
    """Configuration for the web content extractor.

    Attributes:
        extract_comments: whether to extract comments
        extract_tables: whether to extract tables
        extract_links: whether to keep link information
        paragraph_separator: separator between paragraphs
        timeout: network request timeout in seconds
        max_retries: maximum number of retry attempts
        user_agent: custom User-Agent header
        text_cleanup: text cleanup options
    """
    extract_comments: bool = False
    extract_tables: bool = True
    extract_links: bool = False
    paragraph_separator: str = '\n\n'  # reserved; not yet applied by WebTextExtractor
    timeout: int = 10
    max_retries: int = 3
    user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    text_cleanup: Dict[str, bool] = field(default_factory=lambda: {
        'remove_extra_spaces': True,
        'normalize_whitespace': True,
        'remove_special_chars': False,  # reserved; not yet applied by _cleanup_text
        'lowercase': False
    })
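
# For illustration: dataclass defaults mean callers can override options
# selectively, e.g. WebExtractorConfig(timeout=30) keeps every other field
# at its default value.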


class WebTextExtractor:
    """Extractor for the main text content of a web page.

    Uses the trafilatura library to pull out the page's main text while
    discarding ads, navigation, and other irrelevant content.
    """

    def __init__(self, config: Optional[WebExtractorConfig] = None):
        """Initialize the extractor.

        Args:
            config: extractor configuration; the defaults are used when None
        """
        self.config = config or WebExtractorConfig()
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Configure the logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Add a file handler for errors; guard against attaching duplicate
        # handlers when several extractor instances are created.
        if not any(isinstance(h, logging.FileHandler) for h in self.logger.handlers):
            fh = logging.FileHandler('web_extractor.log')
            fh.setLevel(logging.ERROR)
            self.logger.addHandler(fh)

    def _validate_url(self, url: str) -> bool:
        """Check whether a URL is well formed.

        Args:
            url: the web page URL

        Returns:
            bool: True if the URL has both a scheme and a network location
        """
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False
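
    # For reference: urlparse('https://arxiv.org/abs/2412.00036') yields a
    # scheme ('https') and a netloc ('arxiv.org'), so the URL passes, while a
    # bare 'arxiv.org' parses with both fields empty and is rejected.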

    def _download_webpage(self, url: str) -> Optional[str]:
        """Download the raw HTML of a web page.

        Args:
            url: the web page URL

        Returns:
            Optional[str]: the HTML content, or None on failure

        Raises:
            Exception: if every download attempt fails
        """
        headers = {'User-Agent': self.config.user_agent}

        for attempt in range(self.config.max_retries):
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    timeout=self.config.timeout
                )
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                self.logger.warning(f"Attempt {attempt + 1} failed: {e}")
                if attempt == self.config.max_retries - 1:
                    raise Exception(f"Failed to download webpage after {self.config.max_retries} attempts: {e}")
        return None
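
    # Note: retries fire back-to-back. A politer variant (an assumption, not
    # part of the original design) could sleep between attempts, e.g.
    # time.sleep(2 ** attempt) for exponential backoff, trading slower
    # failure for less load on the target server.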

    def _cleanup_text(self, text: str) -> str:
        """Clean up extracted text.

        Args:
            text: the raw extracted text

        Returns:
            str: the cleaned text
        """
        if not text:
            return ""

        # .get() tolerates partial text_cleanup dicts supplied by callers.
        if self.config.text_cleanup.get('normalize_whitespace'):
            # Map CRLF before lone CR so '\r\n' becomes a single '\n'.
            text = text.replace('\t', ' ').replace('\r\n', '\n').replace('\r', '\n')

        if self.config.text_cleanup.get('remove_extra_spaces'):
            # Collapse runs of spaces/tabs but keep line breaks, so the
            # paragraph boundaries in the extracted text survive cleanup.
            text = re.sub(r'[ \t]+', ' ', text)

        if self.config.text_cleanup.get('lowercase'):
            text = text.lower()

        return text.strip()
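
    # Example with the default cleanup settings:
    #   _cleanup_text('  foo\t  bar\r\nbaz  ')  ->  'foo bar\nbaz'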

    def extract_text(self, url: str) -> str:
        """Extract the main text content of a web page.

        Args:
            url: the web page URL

        Returns:
            str: the extracted text content

        Raises:
            ValueError: if the URL is invalid
            Exception: if extraction fails
        """
        try:
            if not self._validate_url(url):
                raise ValueError(f"Invalid URL: {url}")

            self.logger.info(f"Processing URL: {url}")

            # Download the page
            html_content = self._download_webpage(url)
            if not html_content:
                raise Exception("Failed to download webpage")

            # Configure trafilatura extraction options
            extract_config = {
                'include_comments': self.config.extract_comments,
                'include_tables': self.config.extract_tables,
                'include_links': self.config.extract_links,
                'no_fallback': False,  # allow trafilatura's fallback extractors
            }

            # Extract the main text
            extracted_text = trafilatura.extract(
                html_content,
                **extract_config
            )

            if not extracted_text:
                raise Exception("No content could be extracted")

            # Clean up the text
            cleaned_text = self._cleanup_text(extracted_text)

            return cleaned_text

        except Exception as e:
            self.logger.error(f"Extraction failed: {e}")
            raise
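

# A minimal sketch of a possible extension, not part of the original class:
# recent trafilatura releases also expose extract_metadata(), which can
# recover the page title and date alongside the body text. The helper name
# extract_with_metadata is hypothetical.
def extract_with_metadata(extractor: WebTextExtractor, url: str) -> Dict[str, Optional[str]]:
    """Return the extracted text together with basic page metadata."""
    html = extractor._download_webpage(url)
    if not html:
        raise Exception("Failed to download webpage")
    meta = trafilatura.extract_metadata(html)  # may be None on sparse pages
    text = trafilatura.extract(html) or ''
    return {
        'text': extractor._cleanup_text(text),
        'title': meta.title if meta else None,
        'date': meta.date if meta else None,
    }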


def main():
    """Entry point: demonstrate basic usage."""
    # Configuration
    config = WebExtractorConfig(
        extract_comments=False,
        extract_tables=True,
        extract_links=False,
        timeout=10,
        text_cleanup={
            'remove_extra_spaces': True,
            'normalize_whitespace': True,
            'remove_special_chars': False,
            'lowercase': False
        }
    )

    # Create the extractor
    extractor = WebTextExtractor(config)

    # Usage example
    try:
        # Replace with the actual target URL
        sample_url = 'https://arxiv.org/abs/2412.00036'
        text = extractor.extract_text(sample_url)
        print("Extracted text:")
        print(text)

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()