镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 14:36:48 +00:00
Master 4.0 (#2210)
* stage academic conversation * stage document conversation * fix buggy gradio version * file dynamic load * merge more academic plugins * accelerate nltk * feat: 为predict函数添加文件和URL读取功能 - 添加URL检测和网页内容提取功能,支持自动提取网页文本 - 添加文件路径识别和文件内容读取功能,支持private_upload路径格式 - 集成WebTextExtractor处理网页内容提取 - 集成TextContentLoader处理本地文件读取 - 支持文件路径与问题组合的智能处理 * back * block unstable --------- Co-authored-by: XiaoBoAI <liuboyin2019@ia.ac.cn>
这个提交包含在:
@@ -0,0 +1,211 @@
|
||||
import re
|
||||
import os
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class ExcelTableFormatter:
|
||||
"""聊天记录中Markdown表格转Excel生成器"""
|
||||
|
||||
def __init__(self):
|
||||
"""初始化Excel文档对象"""
|
||||
from openpyxl import Workbook
|
||||
self.workbook = Workbook()
|
||||
self._table_count = 0
|
||||
self._current_sheet = None
|
||||
|
||||
def _normalize_table_row(self, row):
|
||||
"""标准化表格行,处理不同的分隔符情况"""
|
||||
row = row.strip()
|
||||
if row.startswith('|'):
|
||||
row = row[1:]
|
||||
if row.endswith('|'):
|
||||
row = row[:-1]
|
||||
return [cell.strip() for cell in row.split('|')]
|
||||
|
||||
def _is_separator_row(self, row):
|
||||
"""检查是否是分隔行(由 - 或 : 组成)"""
|
||||
clean_row = re.sub(r'[\s|]', '', row)
|
||||
return bool(re.match(r'^[-:]+$', clean_row))
|
||||
|
||||
def _extract_tables_from_text(self, text):
|
||||
"""从文本中提取所有表格内容"""
|
||||
if not isinstance(text, str):
|
||||
return []
|
||||
|
||||
tables = []
|
||||
current_table = []
|
||||
is_in_table = False
|
||||
|
||||
for line in text.split('\n'):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
if is_in_table and current_table:
|
||||
if len(current_table) >= 2:
|
||||
tables.append(current_table)
|
||||
current_table = []
|
||||
is_in_table = False
|
||||
continue
|
||||
|
||||
if '|' in line:
|
||||
if not is_in_table:
|
||||
is_in_table = True
|
||||
current_table.append(line)
|
||||
else:
|
||||
if is_in_table and current_table:
|
||||
if len(current_table) >= 2:
|
||||
tables.append(current_table)
|
||||
current_table = []
|
||||
is_in_table = False
|
||||
|
||||
if is_in_table and current_table and len(current_table) >= 2:
|
||||
tables.append(current_table)
|
||||
|
||||
return tables
|
||||
|
||||
def _parse_table(self, table_lines):
|
||||
"""解析表格内容为结构化数据"""
|
||||
try:
|
||||
headers = self._normalize_table_row(table_lines[0])
|
||||
|
||||
separator_index = next(
|
||||
(i for i, line in enumerate(table_lines) if self._is_separator_row(line)),
|
||||
1
|
||||
)
|
||||
|
||||
data_rows = []
|
||||
for line in table_lines[separator_index + 1:]:
|
||||
cells = self._normalize_table_row(line)
|
||||
# 确保单元格数量与表头一致
|
||||
while len(cells) < len(headers):
|
||||
cells.append('')
|
||||
cells = cells[:len(headers)]
|
||||
data_rows.append(cells)
|
||||
|
||||
if headers and data_rows:
|
||||
return {
|
||||
'headers': headers,
|
||||
'data': data_rows
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"解析表格时发生错误: {str(e)}")
|
||||
|
||||
return None
|
||||
|
||||
def _create_sheet(self, question_num, table_num):
|
||||
"""创建新的工作表"""
|
||||
sheet_name = f'Q{question_num}_T{table_num}'
|
||||
if len(sheet_name) > 31:
|
||||
sheet_name = f'Table{self._table_count}'
|
||||
|
||||
if sheet_name in self.workbook.sheetnames:
|
||||
sheet_name = f'{sheet_name}_{datetime.now().strftime("%H%M%S")}'
|
||||
|
||||
return self.workbook.create_sheet(title=sheet_name)
|
||||
|
||||
def create_document(self, history):
|
||||
"""
|
||||
处理聊天历史中的所有表格并创建Excel文档
|
||||
|
||||
Args:
|
||||
history: 聊天历史列表
|
||||
|
||||
Returns:
|
||||
Workbook: 处理完成的Excel工作簿对象,如果没有表格则返回None
|
||||
"""
|
||||
has_tables = False
|
||||
|
||||
# 删除默认创建的工作表
|
||||
default_sheet = self.workbook['Sheet']
|
||||
self.workbook.remove(default_sheet)
|
||||
|
||||
# 遍历所有回答
|
||||
for i in range(1, len(history), 2):
|
||||
answer = history[i]
|
||||
tables = self._extract_tables_from_text(answer)
|
||||
|
||||
for table_lines in tables:
|
||||
parsed_table = self._parse_table(table_lines)
|
||||
if parsed_table:
|
||||
self._table_count += 1
|
||||
sheet = self._create_sheet(i // 2 + 1, self._table_count)
|
||||
|
||||
# 写入表头
|
||||
for col, header in enumerate(parsed_table['headers'], 1):
|
||||
sheet.cell(row=1, column=col, value=header)
|
||||
|
||||
# 写入数据
|
||||
for row_idx, row_data in enumerate(parsed_table['data'], 2):
|
||||
for col_idx, value in enumerate(row_data, 1):
|
||||
sheet.cell(row=row_idx, column=col_idx, value=value)
|
||||
|
||||
has_tables = True
|
||||
|
||||
return self.workbook if has_tables else None
|
||||
|
||||
|
||||
def save_chat_tables(history, save_dir, base_name):
|
||||
"""
|
||||
保存聊天历史中的表格到Excel文件
|
||||
|
||||
Args:
|
||||
history: 聊天历史列表
|
||||
save_dir: 保存目录
|
||||
base_name: 基础文件名
|
||||
|
||||
Returns:
|
||||
list: 保存的文件路径列表
|
||||
"""
|
||||
result_files = []
|
||||
|
||||
try:
|
||||
# 创建Excel格式
|
||||
excel_formatter = ExcelTableFormatter()
|
||||
workbook = excel_formatter.create_document(history)
|
||||
|
||||
if workbook is not None:
|
||||
# 确保保存目录存在
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
|
||||
# 生成Excel文件路径
|
||||
excel_file = os.path.join(save_dir, base_name + '.xlsx')
|
||||
|
||||
# 保存Excel文件
|
||||
workbook.save(excel_file)
|
||||
result_files.append(excel_file)
|
||||
print(f"已保存表格到Excel文件: {excel_file}")
|
||||
except Exception as e:
|
||||
print(f"保存Excel格式失败: {str(e)}")
|
||||
|
||||
return result_files
|
||||
|
||||
|
||||
# 使用示例
|
||||
if __name__ == "__main__":
|
||||
# 示例聊天历史
|
||||
history = [
|
||||
"问题1",
|
||||
"""这是第一个表格:
|
||||
| A | B | C |
|
||||
|---|---|---|
|
||||
| 1 | 2 | 3 |""",
|
||||
|
||||
"问题2",
|
||||
"这是没有表格的回答",
|
||||
|
||||
"问题3",
|
||||
"""回答包含多个表格:
|
||||
| Name | Age |
|
||||
|------|-----|
|
||||
| Tom | 20 |
|
||||
|
||||
第二个表格:
|
||||
| X | Y |
|
||||
|---|---|
|
||||
| 1 | 2 |"""
|
||||
]
|
||||
|
||||
# 保存表格
|
||||
save_dir = "output"
|
||||
base_name = "chat_tables"
|
||||
saved_files = save_chat_tables(history, save_dir, base_name)
|
||||
在新工单中引用
屏蔽一个用户