镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-08 23:46:48 +00:00
implement doc_fns
这个提交包含在:
@@ -0,0 +1,211 @@
|
||||
import re
|
||||
import os
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from openpyxl import Workbook
|
||||
|
||||
|
||||
class ExcelTableFormatter:
    """Convert Markdown tables found in chat answers into Excel worksheets.

    Every table that parses successfully is written to its own worksheet of
    a single openpyxl ``Workbook`` held in ``self.workbook``.
    """

    # Title of the worksheet that a new openpyxl Workbook creates by default.
    _DEFAULT_SHEET = 'Sheet'

    def __init__(self):
        """Create the backing workbook and reset the table counter."""
        self.workbook = Workbook()
        self._table_count = 0
        self._current_sheet = None

    def _normalize_table_row(self, row):
        """Split one Markdown table row into a list of stripped cell strings.

        One leading and one trailing ``|`` are dropped before splitting, so
        ``| a | b |`` and ``a | b`` both yield ``['a', 'b']``.
        """
        row = row.strip()
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    def _is_separator_row(self, row):
        """Return True when *row* consists only of ``-``/``:`` (plus pipes and spaces)."""
        clean_row = re.sub(r'[\s|]', '', row)
        return bool(re.match(r'^[-:]+$', clean_row))

    def _extract_tables_from_text(self, text):
        """Collect runs of consecutive lines that contain ``|``.

        Args:
            text: The answer text. Anything that is not a ``str`` yields ``[]``.

        Returns:
            A list of tables, each a list of stripped lines. A run is ended by
            a blank line or by a line without ``|``; runs shorter than two
            lines are discarded.
        """
        if not isinstance(text, str):
            return []

        tables = []
        current = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if line and '|' in line:
                current.append(line)
                continue
            # A blank or pipe-less line closes whatever run was being collected.
            if len(current) >= 2:
                tables.append(current)
            current = []

        if len(current) >= 2:
            tables.append(current)
        return tables

    def _parse_table(self, table_lines):
        """Turn the lines of one table into ``{'headers': [...], 'data': [...]}``.

        The first line supplies the headers. Data rows are the lines after the
        first separator row; if no separator row is found, index 1 is assumed.
        Each data row is padded with ``''`` or truncated to the header width.

        Returns:
            The parsed dict, or ``None`` when there are no headers, no data
            rows, or parsing raised (the error is printed).
        """
        try:
            headers = self._normalize_table_row(table_lines[0])
            width = len(headers)

            separator_index = next(
                (i for i, line in enumerate(table_lines) if self._is_separator_row(line)),
                1
            )

            data_rows = []
            for line in table_lines[separator_index + 1:]:
                cells = self._normalize_table_row(line)
                # Keep every row exactly as wide as the header row.
                cells += [''] * (width - len(cells))
                data_rows.append(cells[:width])

            if headers and data_rows:
                return {
                    'headers': headers,
                    'data': data_rows
                }
        except Exception as e:
            print(f"解析表格时发生错误: {str(e)}")

        return None

    def _create_sheet(self, question_num, table_num):
        """Create and return a worksheet named ``Q<question>_T<table>``.

        Names longer than 31 characters fall back to ``Table<count>``; a name
        that already exists gets an ``_HHMMSS`` time suffix.
        """
        sheet_name = f'Q{question_num}_T{table_num}'
        if len(sheet_name) > 31:
            sheet_name = f'Table{self._table_count}'

        if sheet_name in self.workbook.sheetnames:
            sheet_name = f'{sheet_name}_{datetime.now().strftime("%H%M%S")}'

        return self.workbook.create_sheet(title=sheet_name)

    def create_document(self, history):
        """Write every table found in the answers of *history* to the workbook.

        Args:
            history: Chat history list; answers are read from the odd indices.

        Returns:
            The workbook when at least one table was written during this
            call, otherwise ``None``.
        """
        has_tables = False

        # Drop the default worksheet. It is only present before the first
        # call, so check first instead of raising KeyError on a repeat call.
        if self._DEFAULT_SHEET in self.workbook.sheetnames:
            self.workbook.remove(self.workbook[self._DEFAULT_SHEET])

        for i in range(1, len(history), 2):
            for table_lines in self._extract_tables_from_text(history[i]):
                parsed_table = self._parse_table(table_lines)
                if not parsed_table:
                    continue

                self._table_count += 1
                sheet = self._create_sheet(i // 2 + 1, self._table_count)

                # Header row goes on row 1, data rows start at row 2.
                for col, header in enumerate(parsed_table['headers'], 1):
                    sheet.cell(row=1, column=col, value=header)

                for row_idx, row_data in enumerate(parsed_table['data'], 2):
                    for col_idx, value in enumerate(row_data, 1):
                        sheet.cell(row=row_idx, column=col_idx, value=value)

                has_tables = True

        return self.workbook if has_tables else None
|
||||
|
||||
|
||||
def save_chat_tables(history, save_dir, base_name):
    """Export the Markdown tables of a chat history to one Excel file.

    Args:
        history: Chat history list passed to ``ExcelTableFormatter``.
        save_dir: Directory to write into; created if it does not exist.
        base_name: File name without extension; ``.xlsx`` is appended.

    Returns:
        list: The paths written. Empty when the history held no tables or
        when any step failed (the failure is printed, not raised).
    """
    saved_paths = []

    try:
        workbook = ExcelTableFormatter().create_document(history)
        if workbook is None:
            # Nothing to export, so do not create the directory either.
            return saved_paths

        os.makedirs(save_dir, exist_ok=True)
        target_path = os.path.join(save_dir, base_name + '.xlsx')
        workbook.save(target_path)

        saved_paths.append(target_path)
        print(f"已保存表格到Excel文件: {target_path}")
    except Exception as e:
        print(f"保存Excel格式失败: {e!s}")

    return saved_paths
|
||||
|
||||
|
||||
# Usage example
if __name__ == "__main__":
    # Sample chat history: questions at even indices, answers at odd indices.
    history = [
        "问题1",
        "\n".join([
            "这是第一个表格：",
            "| A | B | C |",
            "|---|---|---|",
            "| 1 | 2 | 3 |",
        ]),

        "问题2",
        "这是没有表格的回答",

        "问题3",
        "\n".join([
            "回答包含多个表格：",
            "| Name | Age |",
            "|------|-----|",
            "| Tom | 20 |",
            "",
            "第二个表格：",
            "| X | Y |",
            "|---|---|",
            "| 1 | 2 |",
        ]),
    ]

    # Write the tables to output/chat_tables.xlsx
    save_dir = "output"
    base_name = "chat_tables"
    saved_files = save_chat_tables(history, save_dir, base_name)
|
||||
@@ -0,0 +1,190 @@
|
||||
|
||||
|
||||
class HtmlFormatter:
    """Render a chat transcript as a standalone, styled HTML page."""

    # Markup for one question/answer pair.
    _QA_TEMPLATE = '''
                <div class="QaBox">
                    <div class="Question">{question}</div>
                    <div class="Answer">{answer}</div>
                </div>
                '''

    # Markup for one raw history entry.
    _HISTORY_TEMPLATE = '''
                <div class="historyBox">
                    <div class="entry">{entry}</div>
                </div>
                '''

    # Skeleton of the whole page; only the stylesheet and chat body vary.
    _PAGE_TEMPLATE = """
        <!DOCTYPE html>
        <html lang="zh-CN">
        <head>
            <meta charset="utf-8">
            <meta name="viewport" content="width=device-width, initial-scale=1">
            <title>对话存档</title>
            <style>{css}</style>
        </head>
        <body>
            <div class="container">
                <h1 class="chat-title">对话存档</h1>
                <div class="chat-body">
                    {chat}
                </div>
            </div>
        </body>
        </html>
        """

    def __init__(self, chatbot, history):
        """Store the conversation data and the page stylesheet.

        Args:
            chatbot: Iterable of ``(question, answer)`` pairs.
            history: Iterable of history entries (may be empty or ``None``).
        """
        self.chatbot = chatbot
        self.history = history
        self.css_styles = """
        :root {
            --primary-color: #2563eb;
            --primary-light: #eff6ff;
            --secondary-color: #1e293b;
            --background-color: #f8fafc;
            --text-color: #334155;
            --border-color: #e2e8f0;
            --card-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
        }

        body {
            font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.8;
            margin: 0;
            padding: 2rem;
            color: var(--text-color);
            background-color: var(--background-color);
        }

        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 2rem;
            border-radius: 16px;
            box-shadow: var(--card-shadow);
        }
        ::selection {
            background: var(--primary-light);
            color: var(--primary-color);
        }
        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(20px); }
            to { opacity: 1; transform: translateY(0); }
        }

        @keyframes slideIn {
            from { transform: translateX(-20px); opacity: 0; }
            to { transform: translateX(0); opacity: 1; }
        }

        .container {
            animation: fadeIn 0.6s ease-out;
        }

        .QaBox {
            animation: slideIn 0.5s ease-out;
            transition: all 0.3s ease;
        }

        .QaBox:hover {
            transform: translateX(5px);
        }
        .Question, .Answer, .historyBox {
            transition: all 0.3s ease;
        }
        .chat-title {
            color: var(--primary-color);
            font-size: 2em;
            text-align: center;
            margin: 1rem 0 2rem;
            padding-bottom: 1rem;
            border-bottom: 2px solid var(--primary-color);
        }

        .chat-body {
            display: flex;
            flex-direction: column;
            gap: 1.5rem;
            margin: 2rem 0;
        }

        .QaBox {
            background: white;
            padding: 1.5rem;
            border-radius: 8px;
            border-left: 4px solid var(--primary-color);
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
            margin-bottom: 1.5rem;
        }

        .Question {
            color: var(--secondary-color);
            font-weight: 500;
            margin-bottom: 1rem;
        }

        .Answer {
            color: var(--text-color);
            background: var(--primary-light);
            padding: 1rem;
            border-radius: 6px;
        }

        .history-section {
            margin-top: 3rem;
            padding-top: 2rem;
            border-top: 2px solid var(--border-color);
        }

        .history-title {
            color: var(--secondary-color);
            font-size: 1.5em;
            margin-bottom: 1.5rem;
            text-align: center;
        }

        .historyBox {
            background: white;
            padding: 1rem;
            margin: 0.5rem 0;
            border-radius: 6px;
            border: 1px solid var(--border-color);
        }

        @media (prefers-color-scheme: dark) {
            :root {
                --background-color: #0f172a;
                --text-color: #e2e8f0;
                --border-color: #1e293b;
            }

            .container, .QaBox {
                background: #1e293b;
            }
        }
        """

    def format_chat_content(self) -> str:
        """Return one ``QaBox`` div per pair in ``self.chatbot``.

        ``None`` questions or answers are rendered as empty strings; the
        values are inserted as-is, without HTML escaping.
        """
        rendered = []
        for q, a in self.chatbot:
            rendered.append(self._QA_TEMPLATE.format(
                question="" if q is None else str(q),
                answer="" if a is None else str(a),
            ))
        return "\n".join(rendered)

    def format_history_content(self) -> str:
        """Return one ``historyBox`` div per entry, or ``""`` if history is empty."""
        if not self.history:
            return ""
        return "\n".join(
            self._HISTORY_TEMPLATE.format(entry=entry) for entry in self.history
        )

    def create_document(self) -> str:
        """Build the complete HTML page.

        Returns:
            str: The page containing the stylesheet and the formatted chat
            pairs. The history entries are not part of the page.
        """
        return self._PAGE_TEMPLATE.format(
            css=self.css_styles,
            chat=self.format_chat_content(),
        )
|
||||
@@ -0,0 +1,39 @@
|
||||
|
||||
class MarkdownFormatter:
    """Build a Markdown document from a question/answer chat history."""

    def __init__(self):
        # Document fragments; joined with newlines by create_document().
        self.content = []

    def _add_content(self, text: str):
        """Append *text* as a body paragraph; empty or ``None`` text is skipped."""
        if text:
            self.content.append(f"\n{text}\n")

    def create_document(self, history: list) -> str:
        """Create the complete Markdown document.

        Args:
            history: History list; even indices are questions, odd indices
                are answers. A trailing question without an answer is
                rendered with an empty answer section instead of raising
                ``IndexError``.

        Returns:
            str: The generated Markdown text.
        """
        self.content = []

        for i in range(0, len(history), 2):
            number = i // 2 + 1
            question = history[i]
            # The last question may not have been answered yet.
            answer = history[i + 1] if i + 1 < len(history) else ""

            self.content.append(f"\n### 问题 {number}")
            self._add_content(question)

            self.content.append(f"\n### 回答 {number}")
            self._add_content(answer)

            # Horizontal rule between question/answer pairs.
            self.content.append("\n---\n")

        return "\n".join(self.content)
|
||||
@@ -0,0 +1,172 @@
|
||||
from datetime import datetime
|
||||
import os
|
||||
import re
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
|
||||
def convert_markdown_to_pdf(markdown_text):
    """Reduce Markdown to plain text suitable for placing into a PDF.

    Args:
        markdown_text: Markdown source; ``None`` or ``""`` yields ``""``.

    Returns:
        str: Text with heading markers (levels 1-6), bold/italic markers,
        ordered-list numbers and link targets removed, unordered bullets
        replaced by ``•``, and ordinary lines separated by a blank line.
    """
    if not markdown_text:
        return ""

    # Normalize line endings.
    text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # Headings of every level, not only a single '#'.
    text = re.sub(r'^#{1,6}\s+(.+)$', r'\1', text, flags=re.MULTILINE)

    # Lists are handled before emphasis: otherwise the italic pattern pairs a
    # '*' bullet with the next '*' on the line and destroys the bullet.
    text = re.sub(r'^\s*[-*+]\s+(.+?)(?=\n|$)', r'• \1', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+(.+?)(?=\n|$)', r'\1', text, flags=re.MULTILINE)

    # Bold, then italic.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'\*(.+?)\*', r'\1', text)

    # Links: keep the label only.
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)

    # Paragraphs: collapse blank-line runs, then put one blank line between
    # ordinary lines (lines followed by a bullet, digit or space are left alone).
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r'(?<!\n)(?<!^)(?<!•\s)(?<!\d\.\s)\n(?![\s•\d])', '\n\n', text, flags=re.MULTILINE)

    # Whitespace: collapse space runs and trim each line. Only spaces and tabs
    # are trimmed here; matching '\s' would also swallow the newlines that
    # separate the paragraphs built above.
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'(?m)^[ \t]+|[ \t]+$', '', text)

    return text.strip()
|
||||
|
||||
class PDFFormatter:
    """Generate a PDF of a chat history with ReportLab, preferring Noto Sans CJK."""

    def __init__(self):
        self._init_reportlab()
        self._register_fonts()
        self.styles = self._get_reportlab_lib()['getSampleStyleSheet']()
        self._create_styles()

    def _init_reportlab(self):
        """Import the ReportLab pieces lazily and keep them in lookup dicts."""
        from reportlab.lib.pagesizes import A4
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.lib.units import cm
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

        self._lib = {
            'A4': A4,
            'getSampleStyleSheet': getSampleStyleSheet,
            'ParagraphStyle': ParagraphStyle,
            'cm': cm
        }

        self._platypus = {
            'SimpleDocTemplate': SimpleDocTemplate,
            'Paragraph': Paragraph,
            'Spacer': Spacer
        }

    def _get_reportlab_lib(self):
        """Return the dict of ``reportlab.lib`` objects."""
        return self._lib

    def _get_reportlab_platypus(self):
        """Return the dict of ``reportlab.platypus`` classes."""
        return self._platypus

    def _register_fonts(self):
        """Register the first usable Noto Sans CJK font file.

        Sets ``self.font_name`` to ``'NotoSansCJK'`` on success and to
        ``'Helvetica'`` (with a printed warning) when none could be loaded.
        """
        possible_font_paths = [
            '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',
            '/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc',
            '/usr/share/fonts/noto/NotoSansCJK-Regular.ttc'
        ]

        font_registered = False
        for path in possible_font_paths:
            if not os.path.exists(path):
                continue
            try:
                pdfmetrics.registerFont(TTFont('NotoSansCJK', path))
            except Exception:
                # An unreadable font file is not fatal; try the next location.
                # (Not a bare except, so Ctrl-C / SystemExit still propagate.)
                continue
            font_registered = True
            break

        if not font_registered:
            print("Warning: Could not find Noto Sans CJK font. Using fallback font.")
            self.font_name = 'Helvetica'
        else:
            self.font_name = 'NotoSansCJK'

    def _create_styles(self):
        """Add the custom title, date, question and answer paragraph styles."""
        ParagraphStyle = self._lib['ParagraphStyle']

        # Title style
        self.styles.add(ParagraphStyle(
            name='Title_Custom',
            fontName=self.font_name,
            fontSize=24,
            leading=38,
            alignment=1,
            spaceAfter=32
        ))

        # Date style
        self.styles.add(ParagraphStyle(
            name='Date_Style',
            fontName=self.font_name,
            fontSize=16,
            leading=20,
            alignment=1,
            spaceAfter=20
        ))

        # Question style
        self.styles.add(ParagraphStyle(
            name='Question_Style',
            fontName=self.font_name,
            fontSize=12,
            leading=18,
            leftIndent=28,
            spaceAfter=6
        ))

        # Answer style
        self.styles.add(ParagraphStyle(
            name='Answer_Style',
            fontName=self.font_name,
            fontSize=12,
            leading=18,
            leftIndent=28,
            spaceAfter=12
        ))

    @staticmethod
    def _to_paragraph_markup(text):
        """Make plain *text* safe for a ReportLab ``Paragraph``.

        ``Paragraph`` parses its text as XML-like markup, so ``&``, ``<`` and
        ``>`` are escaped; newlines become ``<br/>`` because plain newlines
        are otherwise collapsed into spaces.
        """
        from xml.sax.saxutils import escape

        return escape(text).replace('\n', '<br/>')

    def create_document(self, history, output_path):
        """Write the chat history to a PDF file.

        Args:
            history: History list; even indices are questions, odd indices
                are answers. A missing final answer is treated as empty.
            output_path: Destination path of the PDF.

        Returns:
            The built ``SimpleDocTemplate`` instance.
        """
        cm = self._lib['cm']
        doc = self._platypus['SimpleDocTemplate'](
            output_path,
            pagesize=self._lib['A4'],
            rightMargin=2.6 * cm,
            leftMargin=2.8 * cm,
            topMargin=3.7 * cm,
            bottomMargin=3.5 * cm
        )

        story = []
        Paragraph = self._platypus['Paragraph']

        for i in range(0, len(history), 2):
            number = i // 2 + 1
            question = history[i]
            answer = convert_markdown_to_pdf(history[i + 1]) if i + 1 < len(history) else ""

            if question:
                q_text = f'问题 {number}：{self._to_paragraph_markup(str(question))}'
                story.append(Paragraph(q_text, self.styles['Question_Style']))

            if answer:
                a_text = f'回答 {number}：{self._to_paragraph_markup(str(answer))}'
                story.append(Paragraph(a_text, self.styles['Answer_Style']))

        doc.build(story)

        return doc
|
||||
@@ -0,0 +1,79 @@
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def convert_markdown_to_txt(markdown_text):
    """Convert Markdown to readable plain text.

    Args:
        markdown_text: Markdown source; ``None`` or ``""`` yields ``""``.

    Returns:
        str: Text in which level 1-3 headings keep their ``#`` markers
        (followed by a single space), bold/italic markers are removed,
        unordered bullets become ``•``, links are rewritten as
        ``label (url)``, runs of three or more newlines become one blank
        line, and runs of spaces become one space.
    """
    if not markdown_text:
        return ""

    # Standardize line endings
    text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Headings (levels 1-3): keep the marker, normalize the gap after it
    text = re.sub(r'^(#{1,3})\s+(.+)$', r'\1 \2', text, flags=re.MULTILINE)

    # 2. Bullets first: if emphasis were stripped first, the italic pattern
    #    would pair a '*' bullet with the next '*' on the line.
    text = re.sub(r'^\s*[-*+]\s+(.+?)(?=\n|$)', r'• \1', text, flags=re.MULTILINE)

    # 3. Bold and italic - simply remove the markers
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'\*(.+?)\*', r'\1', text)

    # 4. Markdown links - keep the label followed by the URL in parentheses
    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 (\2)', text)

    # 5. HTML links - same "label (url)" form
    text = re.sub(r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>', r'\2 (\1)',
                  text)

    # 6. Preserve paragraph breaks, at most one blank line in a row
    text = re.sub(r'\n{3,}', '\n\n', text)

    # 7. Collapse runs of spaces (this includes leading indentation)
    text = re.sub(r' +', ' ', text)

    return text.strip()
|
||||
|
||||
|
||||
class TxtFormatter:
    """Chat history TXT document generator."""

    def __init__(self):
        self.content = []
        self._setup_document()

    def _setup_document(self):
        """Append the banner (rule, centered title, rule) to ``self.content``."""
        self.content.append("=" * 50)
        self.content.append("GPT-Academic对话记录".center(48))
        self.content.append("=" * 50)

    def _format_header(self):
        """Return the centered current date followed by a blank-line entry."""
        from datetime import datetime
        date_str = datetime.now().strftime('%Y年%m月%d日')
        return [
            date_str.center(48),
            "\n"  # Add blank line after date
        ]

    def create_document(self, history):
        """Generate the text document from a chat history.

        Args:
            history: History list; even indices are questions, odd indices
                are answers. A missing or empty answer is skipped.

        Returns:
            str: Banner, date and numbered questions/answers joined by
            newlines. The document is rebuilt from scratch on every call, so
            repeated calls do not accumulate earlier conversations.
        """
        # Start from a fresh banner so a second call does not repeat content.
        self.content = []
        self._setup_document()
        self.content.extend(self._format_header())

        for i in range(0, len(history), 2):
            number = i // 2 + 1
            question = history[i]
            raw_answer = history[i + 1] if i + 1 < len(history) else ""
            # Skip the converter for empty/None answers; it expects a string.
            answer = convert_markdown_to_txt(raw_answer) if raw_answer else ""

            if question:
                self.content.append(f"问题 {number}：{str(question)}")
                self.content.append("")  # Add blank line

            if answer:
                self.content.append(f"回答 {number}：{str(answer)}")
                self.content.append("")  # Add blank line

        return "\n".join(self.content)
|
||||
@@ -0,0 +1,155 @@
|
||||
from docx2pdf import convert
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from typing import Union
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
class WordToPdfConverter:
    """Convert Word documents to PDF (LibreOffice on Linux, docx2pdf elsewhere)."""

    @staticmethod
    def convert_to_pdf(word_path: Union[str, Path], pdf_path: Union[str, Path] = None) -> str:
        """Convert one Word document to PDF.

        Args:
            word_path: Path of the Word document.
            pdf_path: Optional output path. Defaults to the Word document's
                path with a ``.pdf`` suffix.

        Returns:
            The path of the generated PDF as a string.

        Raises:
            Exception: If the conversion fails for any reason; the original
                error is attached as ``__cause__``.
        """
        try:
            word_path = Path(word_path)
            pdf_path = word_path.with_suffix('.pdf') if pdf_path is None else Path(pdf_path)

            if platform.system() == 'Linux':
                # Linux relies on a LibreOffice installation.
                which_result = subprocess.run(['which', 'libreoffice'], capture_output=True, text=True)
                if which_result.returncode != 0:
                    raise RuntimeError("请先安装LibreOffice: sudo apt-get install libreoffice")

                print(f"开始转换Word文档: {word_path} 到 PDF")

                result = subprocess.run(
                    ['libreoffice', '--headless', '--convert-to', 'pdf:writer_pdf_Export',
                     str(word_path), '--outdir', str(pdf_path.parent)],
                    capture_output=True, text=True
                )

                if result.returncode != 0:
                    error_msg = result.stderr or "未知错误"
                    print(f"LibreOffice转换失败,错误信息: {error_msg}")
                    raise RuntimeError(f"LibreOffice转换失败: {error_msg}")

                print(f"LibreOffice转换输出: {result.stdout}")

                # LibreOffice names its output after the input file and puts
                # it in --outdir, i.e. next to pdf_path, not next to the Word
                # file. Rename it when the caller asked for another name.
                default_pdf = pdf_path.parent / word_path.with_suffix('.pdf').name
                if default_pdf != pdf_path and default_pdf.exists():
                    os.rename(default_pdf, pdf_path)
                    print(f"已将PDF从 {default_pdf} 重命名为 {pdf_path}")
            else:
                # Windows and macOS use docx2pdf.
                print(f"使用docx2pdf转换 {word_path} 到 {pdf_path}")
                convert(word_path, pdf_path)

            # Verify that a non-empty PDF was actually produced.
            if not pdf_path.exists() or pdf_path.stat().st_size == 0:
                raise RuntimeError("PDF生成失败或文件为空")

            print(f"PDF转换成功,文件大小: {pdf_path.stat().st_size} 字节")

            return str(pdf_path)

        except Exception as e:
            print(f"PDF转换异常: {str(e)}")
            raise Exception(f"转换PDF失败: {str(e)}") from e

    @staticmethod
    def batch_convert(word_dir: Union[str, Path], pdf_dir: Union[str, Path] = None) -> list:
        """Convert every ``*.docx`` file directly inside a directory.

        Args:
            word_dir: Directory containing the Word documents.
            pdf_dir: Optional output directory (created if missing). Defaults
                to writing each PDF next to its Word document.

        Returns:
            List of generated PDF paths. Files that fail to convert are
            reported with ``print`` and skipped.
        """
        word_dir = Path(word_dir)
        if pdf_dir:
            pdf_dir = Path(pdf_dir)
            pdf_dir.mkdir(parents=True, exist_ok=True)

        converted_files = []

        for word_file in word_dir.glob("*.docx"):
            try:
                if pdf_dir:
                    pdf_path = pdf_dir / word_file.with_suffix('.pdf').name
                else:
                    pdf_path = word_file.with_suffix('.pdf')

                pdf_file = WordToPdfConverter.convert_to_pdf(word_file, pdf_path)
                converted_files.append(pdf_file)

            except Exception as e:
                print(f"转换 {word_file} 失败: {str(e)}")

        return converted_files

    @staticmethod
    def convert_doc_to_pdf(doc, output_dir: Union[str, Path] = None) -> str:
        """Convert an in-memory document object to PDF.

        The document is saved to a temporary ``temp_<timestamp>.docx`` in
        *output_dir*, converted, and the temporary file is removed again
        whether or not the conversion succeeded.

        Args:
            doc: Object with a ``save(path)`` method (a python-docx Document).
            output_dir: Optional output directory (created if missing).
                Defaults to the current working directory.

        Returns:
            The path of the generated PDF as a string.

        Raises:
            Exception: If any step fails; the original error is attached as
                ``__cause__``.
        """
        # Bound before the try block so the cleanup below can never hit an
        # unbound name when an early step (e.g. mkdir) fails.
        temp_docx = None
        try:
            output_dir = Path(output_dir) if output_dir else Path.cwd()
            output_dir.mkdir(parents=True, exist_ok=True)

            temp_docx = output_dir / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
            doc.save(temp_docx)

            pdf_path = temp_docx.with_suffix('.pdf')
            WordToPdfConverter.convert_to_pdf(temp_docx, pdf_path)

            return str(pdf_path)

        except Exception as e:
            raise Exception(f"转换PDF失败: {str(e)}") from e
        finally:
            if temp_docx is not None and temp_docx.exists():
                temp_docx.unlink()
|
||||
@@ -0,0 +1,177 @@
|
||||
import re
|
||||
from docx import Document
|
||||
from docx.shared import Cm, Pt
|
||||
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
|
||||
from docx.enum.style import WD_STYLE_TYPE
|
||||
from docx.oxml.ns import qn
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def convert_markdown_to_word(markdown_text):
    """Simplify Markdown into plain text for insertion into a Word document.

    Args:
        markdown_text: Markdown source; ``None`` or ``""`` yields ``""``.

    Returns:
        str: Text in which heading markers are kept (with a single space and
        without closing ``#``), emphasis markers are removed, code is wrapped
        in ``[代码块]``/``[代码]`` tags, bullets become ``•``, links become
        ``label (url)`` and images become ``[图片:alt]``.
    """
    if not markdown_text:
        return ""

    # 0. Normalize all line endings to \n
    text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')

    # 1. Headings (levels 1-6): keep the marker so the level stays visible,
    #    drop an optional closing run of '#'.
    text = re.sub(r'^(#{1,6})\s+(.+?)(?:\s+#+)?$', r'\1 \2', text, flags=re.MULTILINE)

    # 2. Unordered lists, handled before emphasis so that a '*' bullet is not
    #    mistaken for an opening italic marker.
    text = re.sub(r'^(\s*)[-*+]\s+(.+?)$', r'\1• \2', text, flags=re.MULTILINE)

    # 3. Emphasis, longest markers first.
    text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)  # bold italic
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)      # bold
    text = re.sub(r'\*(.+?)\*', r'\1', text)          # italic
    # Underscore emphasis only counts at word boundaries, so identifiers such
    # as my_var_name are left intact; '__' must run before '_'.
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)  # underscore bold
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)    # underscore italic

    # 4. Code is kept, wrapped in simple tags.
    text = re.sub(r'```(?:\w+)?\n([\s\S]*?)```', r'[代码块]\n\1[/代码块]', text)
    text = re.sub(r'`([^`]+)`', r'[代码]\1[/代码]', text)

    # 5. Markdown links. The lookbehind skips image syntax '![alt](url)',
    #    which is handled in step 7.
    text = re.sub(r'(?<!!)\[([^\]]+)\]\(([^)]+?)\s*(?:"[^"]*")?\)', r'\1 (\2)', text)

    # 6. HTML links
    text = re.sub(r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>', r'\2 (\1)',
                  text)

    # 7. Images
    text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', r'[图片:\1]', text)

    return text
|
||||
|
||||
|
||||
class WordFormatter:
    """Chat-history Word document generator.

    Page layout and fonts follow the Chinese official-document format
    standard (GB/T 9704-2012), as stated by the original author.
    """

    def __init__(self):
        self.doc = Document()
        self._setup_document()
        self._create_styles()

    @staticmethod
    def _apply_font(target, name, size, bold=None):
        """Set Latin and East-Asian font name, size and optional bold flag.

        *target* is a run or a style: anything exposing ``.font`` and
        ``._element.rPr``. The East-Asian font has to be set on the
        underlying XML element, otherwise CJK text keeps the default font.
        """
        target.font.name = name
        target._element.rPr.rFonts.set(qn('w:eastAsia'), name)
        target.font.size = Pt(size)
        if bold is not None:
            target.font.bold = bold

    def _add_paragraph_style(self, style_name, font_name, font_size, line_spacing, bold=None,
                             space_before=None, space_after=None, left_indent=None,
                             first_line_indent=None, alignment=None):
        """Register one paragraph style; ``None`` options are left untouched.

        Spacing and indent values are given in points.
        """
        style = self.doc.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH)
        self._apply_font(style, font_name, font_size, bold)

        fmt = style.paragraph_format
        fmt.line_spacing_rule = line_spacing
        if alignment is not None:
            fmt.alignment = alignment
        if space_before is not None:
            fmt.space_before = Pt(space_before)
        if space_after is not None:
            fmt.space_after = Pt(space_after)
        if left_indent is not None:
            fmt.left_indent = Pt(left_indent)
        if first_line_indent is not None:
            fmt.first_line_indent = Pt(first_line_indent)
        return style

    def _setup_document(self):
        """Apply A4 page size, margins and a right-aligned header to every section."""
        for section in self.doc.sections:
            # A4 page
            section.page_width = Cm(21)
            section.page_height = Cm(29.7)
            # Margins
            section.top_margin = Cm(3.7)      # 37 mm
            section.bottom_margin = Cm(3.5)   # 35 mm
            section.left_margin = Cm(2.8)     # 28 mm
            section.right_margin = Cm(2.6)    # 26 mm
            # Header/footer distance from the page edge
            section.header_distance = Cm(2.0)
            section.footer_distance = Cm(2.0)

            # Page header
            header_para = section.header.paragraphs[0]
            header_para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            header_run = header_para.add_run("GPT-Academic对话记录")
            self._apply_font(header_run, '仿宋', 9)

    def _create_styles(self):
        """Register the body, question, answer, title and reference styles."""
        one_and_half = WD_LINE_SPACING.ONE_POINT_FIVE

        # Body text
        self._add_paragraph_style('Normal_Custom', '仿宋', 12, one_and_half, space_after=0)

        # Question
        self._add_paragraph_style('Question_Style', '黑体', 14, one_and_half, bold=True,
                                  space_before=12, space_after=6, left_indent=0)

        # Answer
        self._add_paragraph_style('Answer_Style', '仿宋', 12, one_and_half,
                                  space_before=6, space_after=12, left_indent=0)

        # Document title
        self._add_paragraph_style('Title_Custom', '黑体', 22, one_and_half, bold=True,
                                  space_before=0, space_after=24,
                                  alignment=WD_PARAGRAPH_ALIGNMENT.CENTER)

        # Reference entry: smaller font with a hanging indent
        self._add_paragraph_style('Reference_Style', '宋体', 10.5, WD_LINE_SPACING.SINGLE,
                                  space_before=3, space_after=3,
                                  left_indent=21, first_line_indent=-21)

        # Reference section title
        self._add_paragraph_style('Reference_Title_Style', '黑体', 16, one_and_half, bold=True,
                                  space_before=24, space_after=12)

    def create_document(self, history):
        """Write the chat history into the document.

        Args:
            history: History list; even indices are questions, odd indices
                are answers. A missing or empty answer is skipped instead of
                raising.

        Returns:
            The populated document object (``self.doc``).
        """
        # Title
        self.doc.add_paragraph(style='Title_Custom').add_run('GPT-Academic 对话记录')

        # Date line
        date_para = self.doc.add_paragraph()
        date_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        date_run = date_para.add_run(datetime.now().strftime('%Y年%m月%d日'))
        self._apply_font(date_run, '仿宋', 16)

        self.doc.add_paragraph()  # blank line

        # Conversation
        for i in range(0, len(history), 2):
            number = i // 2 + 1
            question = history[i]
            # The last question may have no answer yet; None answers are
            # skipped as well because the converter expects a string.
            raw_answer = history[i + 1] if i + 1 < len(history) else ""
            answer = convert_markdown_to_word(raw_answer) if raw_answer else ""

            if question:
                q_para = self.doc.add_paragraph(style='Question_Style')
                q_para.add_run(f'问题 {number}：').bold = True
                q_para.add_run(str(question))

            if answer:
                a_para = self.doc.add_paragraph(style='Answer_Style')
                a_para.add_run(f'回答 {number}：').bold = True
                a_para.add_run(str(answer))

        return self.doc
|
||||
|
||||
在新工单中引用
屏蔽一个用户