镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 06:26:47 +00:00
80 行
2.9 KiB
Python
80 行
2.9 KiB
Python
|
||
import re
|
||
|
||
|
||
def convert_markdown_to_txt(markdown_text):
|
||
"""Convert markdown text to plain text while preserving formatting"""
|
||
# Standardize line endings
|
||
markdown_text = markdown_text.replace('\r\n', '\n').replace('\r', '\n')
|
||
|
||
# 1. Handle headers but keep their formatting instead of removing them
|
||
markdown_text = re.sub(r'^#\s+(.+)$', r'# \1', markdown_text, flags=re.MULTILINE)
|
||
markdown_text = re.sub(r'^##\s+(.+)$', r'## \1', markdown_text, flags=re.MULTILINE)
|
||
markdown_text = re.sub(r'^###\s+(.+)$', r'### \1', markdown_text, flags=re.MULTILINE)
|
||
|
||
# 2. Handle bold and italic - simply remove markers
|
||
markdown_text = re.sub(r'\*\*(.+?)\*\*', r'\1', markdown_text)
|
||
markdown_text = re.sub(r'\*(.+?)\*', r'\1', markdown_text)
|
||
|
||
# 3. Handle lists but preserve formatting
|
||
markdown_text = re.sub(r'^\s*[-*+]\s+(.+?)(?=\n|$)', r'• \1', markdown_text, flags=re.MULTILINE)
|
||
|
||
# 4. Handle links - keep only the text
|
||
markdown_text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1 (\2)', markdown_text)
|
||
|
||
# 5. Handle HTML links - convert to user-friendly format
|
||
markdown_text = re.sub(r'<a href=[\'"]([^\'"]+)[\'"](?:\s+target=[\'"][^\'"]+[\'"])?>([^<]+)</a>', r'\2 (\1)',
|
||
markdown_text)
|
||
|
||
# 6. Preserve paragraph breaks
|
||
markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text) # normalize multiple newlines to double newlines
|
||
|
||
# 7. Clean up extra spaces but maintain indentation
|
||
markdown_text = re.sub(r' +', ' ', markdown_text)
|
||
|
||
return markdown_text.strip()
|
||
|
||
|
||
class TxtFormatter:
|
||
"""Chat history TXT document generator"""
|
||
|
||
def __init__(self):
|
||
self.content = []
|
||
self._setup_document()
|
||
|
||
def _setup_document(self):
|
||
"""Initialize document with header"""
|
||
self.content.append("=" * 50)
|
||
self.content.append("GPT-Academic对话记录".center(48))
|
||
self.content.append("=" * 50)
|
||
|
||
def _format_header(self):
|
||
"""Create document header with current date"""
|
||
from datetime import datetime
|
||
date_str = datetime.now().strftime('%Y年%m月%d日')
|
||
return [
|
||
date_str.center(48),
|
||
"\n" # Add blank line after date
|
||
]
|
||
|
||
def create_document(self, history):
|
||
"""Generate document from chat history"""
|
||
# Add header with date
|
||
self.content.extend(self._format_header())
|
||
|
||
# Add conversation content
|
||
for i in range(0, len(history), 2):
|
||
question = history[i]
|
||
answer = convert_markdown_to_txt(history[i + 1]) if i + 1 < len(history) else ""
|
||
|
||
if question:
|
||
self.content.append(f"问题 {i // 2 + 1}:{str(question)}")
|
||
self.content.append("") # Add blank line
|
||
|
||
if answer:
|
||
self.content.append(f"回答 {i // 2 + 1}:{str(answer)}")
|
||
self.content.append("") # Add blank line
|
||
|
||
# Join all content with newlines
|
||
return "\n".join(self.content)
|