这个提交包含在:
binary-husky
2024-02-25 22:16:46 +08:00
父节点 47289f863d
当前提交 d0703ef32d
共有 96 个文件被更改,包括 7507 次插入2453 次删除

查看文件

@@ -4,62 +4,47 @@ import os
import math
from textwrap import dedent
from functools import lru_cache
from pymdownx.superfences import fence_div_format, fence_code_format
from pymdownx.superfences import fence_code_format
from latex2mathml.converter import convert as tex2mathml
from shared_utils.config_loader import get_conf as get_conf
pj = os.path.join
default_user_name = 'default_user'
from shared_utils.text_mask import apply_gpt_academic_string_mask
markdown_extension_configs = {
'mdx_math': {
'enable_dollar_delimiter': True,
'use_gitlab_delimiters': False,
"mdx_math": {
"enable_dollar_delimiter": True,
"use_gitlab_delimiters": False,
},
}
code_highlight_configs = {
"pymdownx.superfences": {
'css_class': 'codehilite',
"css_class": "codehilite",
"custom_fences": [
{
'name': 'mermaid',
'class': 'mermaid',
'format': fence_code_format
}
]
{"name": "mermaid", "class": "mermaid", "format": fence_code_format}
],
},
"pymdownx.highlight": {
'css_class': 'codehilite',
'guess_lang': True,
"css_class": "codehilite",
"guess_lang": True,
# 'auto_title': True,
# 'linenums': True
}
},
}
def text_divide_paragraph(text):
    """
    Split plain text on newlines and wrap it in markdown-body HTML.

    Already-wrapped input, fenced-code input and raw-HTML input are
    returned untouched; anything else gets spaces escaped as &nbsp;
    and newlines rendered as </br>.
    """
    pre, suf = '<div class="markdown-body">', "</div>"
    already_wrapped = text.startswith(pre) and text.endswith(suf)
    if already_wrapped or "```" in text or "</div>" in text:
        # careful input: do not touch markdown code fences or raw html
        return text
    # whatever input: plain text, escape and join with explicit breaks
    escaped = [segment.replace(" ", "&nbsp;") for segment in text.split("\n")]
    return pre + "</br>".join(escaped) + suf
# Same highlight configuration as `code_highlight_configs`, but with the
# mermaid custom fence left commented out so mermaid blocks are NOT rendered
# as diagrams (they fall back to plain highlighted code).
code_highlight_configs_block_mermaid = {
    "pymdownx.superfences": {
        "css_class": "codehilite",
        # mermaid fence intentionally disabled here:
        # "custom_fences": [
        #     {"name": "mermaid", "class": "mermaid", "format": fence_code_format}
        # ],
    },
    "pymdownx.highlight": {
        "css_class": "codehilite",
        "guess_lang": True,
        # 'auto_title': True,
        # 'linenums': True
    },
}
def tex2mathml_catch_exception(content, *args, **kwargs):
try:
@@ -71,20 +56,20 @@ def tex2mathml_catch_exception(content, *args, **kwargs):
def replace_math_no_render(match):
content = match.group(1)
if 'mode=display' in match.group(0):
content = content.replace('\n', '</br>')
return f"<font color=\"#00FF00\">$$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$$</font>"
if "mode=display" in match.group(0):
content = content.replace("\n", "</br>")
return f'<font color="#00FF00">$$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$$</font>'
else:
return f"<font color=\"#00FF00\">$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$</font>"
return f'<font color="#00FF00">$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$</font>'
def replace_math_render(match):
content = match.group(1)
if 'mode=display' in match.group(0):
if '\\begin{aligned}' in content:
content = content.replace('\\begin{aligned}', '\\begin{array}')
content = content.replace('\\end{aligned}', '\\end{array}')
content = content.replace('&', ' ')
if "mode=display" in match.group(0):
if "\\begin{aligned}" in content:
content = content.replace("\\begin{aligned}", "\\begin{array}")
content = content.replace("\\end{aligned}", "\\end{array}")
content = content.replace("&", " ")
content = tex2mathml_catch_exception(content, display="block")
return content
else:
@@ -95,9 +80,11 @@ def markdown_bug_hunt(content):
"""
解决一个mdx_math的bug单$包裹begin命令时多余<script>
"""
content = content.replace('<script type="math/tex">\n<script type="math/tex; mode=display">',
'<script type="math/tex; mode=display">')
content = content.replace('</script>\n</script>', '</script>')
content = content.replace(
'<script type="math/tex">\n<script type="math/tex; mode=display">',
'<script type="math/tex; mode=display">',
)
content = content.replace("</script>\n</script>", "</script>")
return content
@@ -105,25 +92,29 @@ def is_equation(txt):
"""
判定是否为公式 | 测试1 写出洛伦兹定律,使用tex格式公式 测试2 给出柯西不等式,使用latex格式 测试3 写出麦克斯韦方程组
"""
if '```' in txt and '```reference' not in txt: return False
if '$' not in txt and '\\[' not in txt: return False
if "```" in txt and "```reference" not in txt:
return False
if "$" not in txt and "\\[" not in txt:
return False
mathpatterns = {
r'(?<!\\|\$)(\$)([^\$]+)(\$)': {'allow_multi_lines': False}, #  $...$
r'(?<!\\)(\$\$)([^\$]+)(\$\$)': {'allow_multi_lines': True}, # $$...$$
r'(?<!\\)(\\\[)(.+?)(\\\])': {'allow_multi_lines': False}, # \[...\]
r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False}, #  $...$
r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True}, # $$...$$
r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False}, # \[...\]
# r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False}, # \(...\)
# r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True}, # \begin...\end
# r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False}, # $`...`$
}
matches = []
for pattern, property in mathpatterns.items():
flags = re.ASCII | re.DOTALL if property['allow_multi_lines'] else re.ASCII
flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
matches.extend(re.findall(pattern, txt, flags))
if len(matches) == 0: return False
if len(matches) == 0:
return False
contain_any_eq = False
illegal_pattern = re.compile(r'[^\x00-\x7F]|echo')
illegal_pattern = re.compile(r"[^\x00-\x7F]|echo")
for match in matches:
if len(match) != 3: return False
if len(match) != 3:
return False
eq_canidate = match[1]
if illegal_pattern.search(eq_canidate):
return False
@@ -134,27 +125,28 @@ def is_equation(txt):
def fix_markdown_indent(txt):
# fix markdown indent
if (' - ' not in txt) or ('. ' not in txt):
if (" - " not in txt) or (". " not in txt):
# do not need to fix, fast escape
return txt
# walk through the lines and fix non-standard indentation
lines = txt.split("\n")
pattern = re.compile(r'^\s+-')
pattern = re.compile(r"^\s+-")
activated = False
for i, line in enumerate(lines):
if line.startswith('- ') or line.startswith('1. '):
if line.startswith("- ") or line.startswith("1. "):
activated = True
if activated and pattern.match(line):
stripped_string = line.lstrip()
num_spaces = len(line) - len(stripped_string)
if (num_spaces % 4) == 3:
num_spaces_should_be = math.ceil(num_spaces / 4) * 4
lines[i] = ' ' * num_spaces_should_be + stripped_string
return '\n'.join(lines)
lines[i] = " " * num_spaces_should_be + stripped_string
return "\n".join(lines)
FENCED_BLOCK_RE = re.compile(
dedent(r'''
dedent(
r"""
(?P<fence>^[ \t]*(?:~{3,}|`{3,}))[ ]* # opening fence
((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
(\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
@@ -162,16 +154,17 @@ FENCED_BLOCK_RE = re.compile(
\n # newline (end of opening fence)
(?P<code>.*?)(?<=\n) # the code block
(?P=fence)[ ]*$ # closing fence
'''),
re.MULTILINE | re.DOTALL | re.VERBOSE
"""
),
re.MULTILINE | re.DOTALL | re.VERBOSE,
)
def get_line_range(re_match_obj, txt):
start_pos, end_pos = re_match_obj.regs[0]
num_newlines_before = txt[:start_pos+1].count('\n')
num_newlines_before = txt[: start_pos + 1].count("\n")
line_start = num_newlines_before
line_end = num_newlines_before + txt[start_pos:end_pos].count('\n')+1
line_end = num_newlines_before + txt[start_pos:end_pos].count("\n") + 1
return line_start, line_end
@@ -181,14 +174,16 @@ def fix_code_segment_indent(txt):
txt_tmp = txt
while True:
re_match_obj = FENCED_BLOCK_RE.search(txt_tmp)
if not re_match_obj: break
if len(lines) == 0: lines = txt.split("\n")
if not re_match_obj:
break
if len(lines) == 0:
lines = txt.split("\n")
# 清空 txt_tmp 对应的位置方便下次搜索
start_pos, end_pos = re_match_obj.regs[0]
txt_tmp = txt_tmp[:start_pos] + ' '*(end_pos-start_pos) + txt_tmp[end_pos:]
txt_tmp = txt_tmp[:start_pos] + " " * (end_pos - start_pos) + txt_tmp[end_pos:]
line_start, line_end = get_line_range(re_match_obj, txt)
# 获取公共缩进
shared_indent_cnt = 1e5
for i in range(line_start, line_end):
@@ -202,26 +197,26 @@ def fix_code_segment_indent(txt):
num_spaces_should_be = math.ceil(shared_indent_cnt / 4) * 4
for i in range(line_start, line_end):
add_n = num_spaces_should_be - shared_indent_cnt
lines[i] = ' ' * add_n + lines[i]
if not change_any: # 遇到第一个
lines[i] = " " * add_n + lines[i]
if not change_any: # 遇到第一个
change_any = True
if change_any:
return '\n'.join(lines)
return "\n".join(lines)
else:
return txt
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
def markdown_convertion(txt):
"""
将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。
"""
pre = '<div class="markdown-body">'
suf = '</div>'
suf = "</div>"
if txt.startswith(pre) and txt.endswith(suf):
# print('警告,输入了已经经过转化的字符串,二次转化可能出问题')
return txt # 已经被转化过,不需要再次转化
return txt # 已经被转化过,不需要再次转化
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
@@ -229,18 +224,47 @@ def markdown_convertion(txt):
# txt = fix_code_segment_indent(txt)
if is_equation(txt): # 有$标识的公式符号,且没有代码段```的标识
# convert everything to html format
split = markdown.markdown(text='---')
convert_stage_1 = markdown.markdown(text=txt, extensions=['sane_lists', 'tables', 'mdx_math', 'pymdownx.superfences', 'pymdownx.highlight'],
extension_configs={**markdown_extension_configs, **code_highlight_configs})
split = markdown.markdown(text="---")
convert_stage_1 = markdown.markdown(
text=txt,
extensions=[
"sane_lists",
"tables",
"mdx_math",
"pymdownx.superfences",
"pymdownx.highlight",
],
extension_configs={**markdown_extension_configs, **code_highlight_configs},
)
convert_stage_1 = markdown_bug_hunt(convert_stage_1)
# 1. convert to easy-to-copy tex (do not render math)
convert_stage_2_1, n = re.subn(find_equation_pattern, replace_math_no_render, convert_stage_1, flags=re.DOTALL)
convert_stage_2_1, n = re.subn(
find_equation_pattern,
replace_math_no_render,
convert_stage_1,
flags=re.DOTALL,
)
# 2. convert to rendered equation
convert_stage_2_2, n = re.subn(find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL)
convert_stage_2_2, n = re.subn(
find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
)
# cat them together
return pre + convert_stage_2_1 + f'{split}' + convert_stage_2_2 + suf
return pre + convert_stage_2_1 + f"{split}" + convert_stage_2_2 + suf
else:
return pre + markdown.markdown(txt, extensions=['sane_lists', 'tables', 'pymdownx.superfences', 'pymdownx.highlight'], extension_configs=code_highlight_configs) + suf
return (
pre
+ markdown.markdown(
txt,
extensions=[
"sane_lists",
"tables",
"pymdownx.superfences",
"pymdownx.highlight",
],
extension_configs=code_highlight_configs,
)
+ suf
)
def close_up_code_segment_during_stream(gpt_reply):
@@ -254,20 +278,67 @@ def close_up_code_segment_during_stream(gpt_reply):
str: 返回一个新的字符串,将输出代码片段的“后面的```”补上。
"""
if '```' not in gpt_reply:
if "```" not in gpt_reply:
return gpt_reply
if gpt_reply.endswith('```'):
if gpt_reply.endswith("```"):
return gpt_reply
# 排除了以上两个情况,我们
segments = gpt_reply.split('```')
segments = gpt_reply.split("```")
n_mark = len(segments) - 1
if n_mark % 2 == 1:
return gpt_reply + '\n```' # 输出代码片段中!
return gpt_reply + "\n```" # 输出代码片段中!
else:
return gpt_reply
def special_render_issues_for_mermaid(text):
    """
    Inelegant workaround for one mermaid rendering special case coming from
    core_functional.py: the mermaid block inside the "总结绘制脑图" (mind-map)
    prompt suffix must not be rendered as a diagram.
    """

    @lru_cache(maxsize=1)
    def _mindmap_suffix():
        # imported lazily — presumably to avoid a circular import at load time; TODO confirm
        from core_functional import get_core_functions

        return get_core_functions()["总结绘制脑图"]["Suffix"]

    if text.endswith(_mindmap_suffix()):
        # downgrade the mermaid fence to a plain code fence
        text = text.replace("```mermaid", "```")
    return text
def compat_non_markdown_input(text):
    """
    Improve the display of non-markdown input: spaces become &nbsp; and
    newlines become </br>. Markdown (fenced-code) input and raw-HTML input
    pass through unchanged (apart from the mermaid special case).
    """
    if "```" in text:
        # careful input: markdown with code fences — only patch the mermaid special case
        return special_render_issues_for_mermaid(text)
    if "</div>" in text:
        # careful input: raw html, leave untouched
        return text
    # whatever input: plain text, escape spaces and line breaks for html display
    escaped = (line.replace(" ", "&nbsp;") for line in text.split("\n"))
    return "</br>".join(escaped)
@lru_cache(maxsize=128)  # lru cache speeds up repeated conversions of the same text
def simple_markdown_convertion(text):
    """
    Lightweight markdown -> HTML conversion for user inputs
    (code highlighting and tables only, no math rendering).
    """
    pre, suf = '<div class="markdown-body">', "</div>"
    if text.startswith(pre) and text.endswith(suf):
        # already converted once; a second pass is not needed
        return text
    body = compat_non_markdown_input(text)  # tolerate non-markdown input
    body = markdown.markdown(
        body,
        extensions=["pymdownx.superfences", "tables", "pymdownx.highlight"],
        extension_configs=code_highlight_configs,
    )
    return pre + body + suf
def format_io(self, y):
"""
将输入和输出解析为HTML格式。将y中最后一项的输入部分段落化,并将输出部分的Markdown和数学公式转换为HTML格式。
@@ -275,13 +346,16 @@ def format_io(self, y):
if y is None or y == []:
return []
i_ask, gpt_reply = y[-1]
# 输入部分太自由,预处理一波
if i_ask is not None: i_ask = text_divide_paragraph(i_ask)
i_ask = apply_gpt_academic_string_mask(i_ask, mode="show_render")
gpt_reply = apply_gpt_academic_string_mask(gpt_reply, mode="show_render")
# 当代码输出半截的时候,试着补上后个```
if gpt_reply is not None: gpt_reply = close_up_code_segment_during_stream(gpt_reply)
# process
if gpt_reply is not None:
gpt_reply = close_up_code_segment_during_stream(gpt_reply)
# 处理提问与输出
y[-1] = (
None if i_ask is None else markdown.markdown(i_ask, extensions=['pymdownx.superfences', 'tables', 'pymdownx.highlight'], extension_configs=code_highlight_configs),
None if gpt_reply is None else markdown_convertion(gpt_reply)
# 输入部分
None if i_ask is None else simple_markdown_convertion(i_ask),
# 输出部分
None if gpt_reply is None else markdown_convertion(gpt_reply),
)
return y

查看文件

@@ -52,7 +52,7 @@ def get_plugin_default_kwargs():
}
chatbot = ChatBotWithCookies(llm_kwargs)
# txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port
# txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request
DEFAULT_FN_GROUPS_kwargs = {
"main_input": "./README.md",
"llm_kwargs": llm_kwargs,
@@ -60,7 +60,7 @@ def get_plugin_default_kwargs():
"chatbot_with_cookie": chatbot,
"history": [],
"system_prompt": "You are a good AI.",
"web_port": None,
"user_request": None,
}
return DEFAULT_FN_GROUPS_kwargs

查看文件

@@ -0,0 +1,137 @@
import importlib
import time
import inspect
import re
import os
import base64
import gradio
import shutil
import glob
from shared_utils.config_loader import get_conf
def html_local_file(file):
    """
    Turn an existing local path into a gradio-style 'file=' URL relative to the
    project directory; values that are not existing paths pass through as-is.
    """
    project_dir = os.path.dirname(__file__)  # project directory of this module
    if not os.path.exists(str(file)):
        return file
    return "file=" + file.replace(project_dir, ".")
def html_local_img(__file, layout="left", max_width=None, max_height=None, md=True):
    """
    Render a local image reference.

    Args:
        __file: local file path (mapped to a gradio 'file=' URL when it exists).
        layout: horizontal alignment of the HTML <div> wrapper (HTML form only).
        max_width / max_height: optional CSS size constraints (HTML form only).
        md: when True (default), return a markdown image instead of HTML.

    Returns:
        A markdown image string when md=True, otherwise an aligned HTML <img> div.
    """
    __file = html_local_file(__file)  # map existing paths to 'file=' URLs
    if md:
        # markdown form ignores layout and size; return early so the HTML
        # string is not built and immediately discarded (original always built it)
        return f"![{__file}]({__file})"
    style = ""
    if max_width is not None:
        style += f"max-width: {max_width};"
    if max_height is not None:
        style += f"max-height: {max_height};"
    return f'<div align="{layout}"><img src="{__file}" style="{style}"></div>'
def file_manifest_filter_type(file_list, filter_: list = None):
    """
    Walk a file manifest and replace image files with inline <img> HTML;
    every other entry is kept verbatim. `filter_` lists the extensions
    treated as images (default: png/jpg/jpeg).
    """
    image_exts = filter_ or ["png", "jpg", "jpeg"]
    result = []
    for entry in file_list:
        extension = str(os.path.basename(entry)).split(".")[-1]
        if extension in image_exts:
            result.append(html_local_img(entry, md=False))
        else:
            result.append(entry)
    return result
def zip_extract_member_new(self, member, targetpath, pwd):
    # Fixes garbled (mojibake) Chinese filenames when extracting zip archives;
    # used to monkey-patch ZipFile._extract_member (see extract_archive in this file).
    """Extract the ZipInfo object 'member' to a physical
    file on the path targetpath.
    """
    import zipfile
    if not isinstance(member, zipfile.ZipInfo):
        member = self.getinfo(member)

    # build the destination pathname, replacing
    # forward slashes to platform specific separators.
    arcname = member.filename.replace('/', os.path.sep)
    # zipfile decodes non-UTF-8 entry names as cp437; round-trip back to bytes
    # and decode as GBK so Chinese names come out readable (undecodable bytes
    # are replaced rather than raising).
    arcname = arcname.encode('cp437', errors='replace').decode('gbk', errors='replace')
    if os.path.altsep:
        arcname = arcname.replace(os.path.altsep, os.path.sep)
    # interpret absolute pathname as relative, remove drive letter or
    # UNC path, redundant separators, "." and ".." components.
    arcname = os.path.splitdrive(arcname)[1]
    invalid_path_parts = ('', os.path.curdir, os.path.pardir)
    arcname = os.path.sep.join(x for x in arcname.split(os.path.sep)
                               if x not in invalid_path_parts)
    if os.path.sep == '\\':
        # filter illegal characters on Windows
        arcname = self._sanitize_windows_name(arcname, os.path.sep)

    targetpath = os.path.join(targetpath, arcname)
    targetpath = os.path.normpath(targetpath)

    # Create all upper directories if necessary.
    upperdirs = os.path.dirname(targetpath)
    if upperdirs and not os.path.exists(upperdirs):
        os.makedirs(upperdirs)

    if member.is_dir():
        if not os.path.isdir(targetpath):
            os.mkdir(targetpath)
        return targetpath

    with self.open(member, pwd=pwd) as source, \
            open(targetpath, "wb") as target:
        shutil.copyfileobj(source, target)

    return targetpath
def extract_archive(file_path, dest_dir):
    """
    Extract an archive at `file_path` into `dest_dir`, dispatching on extension.

    Supports .zip (with GBK filename fix), .tar/.gz/.bz2, and — when the optional
    dependencies are installed — .rar and .7z.

    Returns:
        "" on success or for unsupported extensions; a user-facing error message
        (in Chinese) when an optional rar/7z dependency is missing or extraction fails.
    """
    import zipfile
    import tarfile
    import os

    # Get the file extension of the input file
    file_extension = os.path.splitext(file_path)[1]

    # Extract the archive based on its extension
    if file_extension == ".zip":
        with zipfile.ZipFile(file_path, "r") as zipobj:
            # patch the extractor to fix garbled Chinese (GBK) filenames
            zipobj._extract_member = lambda a, b, c: zip_extract_member_new(zipobj, a, b, c)
            zipobj.extractall(path=dest_dir)
            print("Successfully extracted zip archive to {}".format(dest_dir))
    elif file_extension in [".tar", ".gz", ".bz2"]:
        with tarfile.open(file_path, "r:*") as tarobj:
            tarobj.extractall(path=dest_dir)
            print("Successfully extracted tar archive to {}".format(dest_dir))
    # Third-party dependency: pip install rarfile
    # On Windows, WinRAR must also be installed and on PATH, e.g. "C:\Program Files\WinRAR"
    elif file_extension == ".rar":
        try:
            import rarfile

            with rarfile.RarFile(file_path) as rf:
                rf.extractall(path=dest_dir)
            print("Successfully extracted rar archive to {}".format(dest_dir))
        # narrowed from bare `except:` so KeyboardInterrupt/SystemExit are not swallowed
        except Exception:
            print("Rar format requires additional dependencies to install")
            return "\n\n解压失败! 需要安装pip install rarfile来解压rar文件。建议使用zip压缩格式。"
    # Third-party dependency: pip install py7zr
    elif file_extension == ".7z":
        try:
            import py7zr

            with py7zr.SevenZipFile(file_path, mode="r") as f:
                f.extractall(path=dest_dir)
            print("Successfully extracted 7z archive to {}".format(dest_dir))
        # narrowed from bare `except:` so KeyboardInterrupt/SystemExit are not swallowed
        except Exception:
            print("7z format requires additional dependencies to install")
            return "\n\n解压失败! 需要安装pip install py7zr来解压7z文件"
    else:
        return ""
    return ""

查看文件

@@ -14,7 +14,7 @@ def is_openai_api_key(key):
if len(CUSTOM_API_KEY_PATTERN) != 0:
API_MATCH_ORIGINAL = re.match(CUSTOM_API_KEY_PATTERN, key)
else:
API_MATCH_ORIGINAL = re.match(r"sk-[a-zA-Z0-9]{48}$", key)
API_MATCH_ORIGINAL = re.match(r"sk-[a-zA-Z0-9]{48}$|sess-[a-zA-Z0-9]{40}$", key)
return bool(API_MATCH_ORIGINAL)

107
shared_utils/text_mask.py 普通文件
查看文件

@@ -0,0 +1,107 @@
import re
from functools import lru_cache
# 这段代码是使用Python编程语言中的re模块,即正则表达式库,来定义了一个正则表达式模式。
# 这个模式被编译成一个正则表达式对象,存储在名为const_extract_exp的变量中,以便于后续快速的匹配和查找操作。
# 这里解释一下正则表达式中的几个特殊字符:
# - . 表示任意单一字符。
# - * 表示前一个字符可以出现0次或多次。
# - ? 在这里用作非贪婪匹配,也就是说它会匹配尽可能少的字符。在(.*?)中,它确保我们匹配的任意文本是尽可能短的,也就是说,它会在</show_llm>和</show_render>标签之前停止匹配。
# - () 括号在正则表达式中表示捕获组。
# - 在这个例子中,(.*?)表示捕获任意长度的文本,直到遇到括号外部最近的限定符,即</show_llm>和</show_render>。
# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
const_extract_re = re.compile(
r"<gpt_academic_string_mask><show_llm>(.*?)</show_llm><show_render>(.*?)</show_render></gpt_academic_string_mask>"
)
# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
const_extract_langbased_re = re.compile(
r"<gpt_academic_string_mask><lang_english>(.*?)</lang_english><lang_chinese>(.*?)</lang_chinese></gpt_academic_string_mask>",
flags=re.DOTALL,
)
@lru_cache(maxsize=128)
def apply_gpt_academic_string_mask(string, mode="show_all"):
    """
    Resolve <gpt_academic_string_mask> tags in a string for a given audience.

    mode="show_llm"    -> keep only the payload meant for the language model
    mode="show_render" -> keep only the payload meant for the web renderer
    mode="show_all"    -> return the string untouched

    Strings that carry no mask tag are always returned unchanged; any other
    mode (with a mask present) raises ValueError.
    """
    if "<gpt_academic_string_mask>" not in string:
        # nothing to resolve
        return string
    if mode == "show_all":
        return string
    if mode == "show_llm":
        return const_extract_re.sub(r"\1", string)
    if mode == "show_render":
        return const_extract_re.sub(r"\2", string)
    raise ValueError("Invalid mode")
@lru_cache(maxsize=128)
def build_gpt_academic_masked_string(text_show_llm="", text_show_render=""):
    """
    Build a masked string that carries two payloads: one shown to the
    language model and one shown by the web renderer.
    """
    return (
        "<gpt_academic_string_mask>"
        f"<show_llm>{text_show_llm}</show_llm>"
        f"<show_render>{text_show_render}</show_render>"
        "</gpt_academic_string_mask>"
    )
@lru_cache(maxsize=128)
def apply_gpt_academic_string_mask_langbased(string, lang_reference):
    """
    Resolve language-based mask tags (<lang_english>/<lang_chinese>) in `string`
    according to the language detected in `lang_reference`: if the reference text
    contains any CJK character the Chinese payload is kept, otherwise the English
    one. Strings without a mask tag are returned unchanged.

    Example:
        string         = "...<lang_english>EN</lang_english><lang_chinese>中文</lang_chinese>..."
        lang_reference = "hello world"   -> the English payload "EN" is kept
    """
    if "<gpt_academic_string_mask>" not in string:  # No need to process
        return string

    reference_is_chinese = re.search(r"[\u4e00-\u9fff]", lang_reference) is not None
    chosen_group = r"\2" if reference_is_chinese else r"\1"
    return const_extract_langbased_re.sub(chosen_group, string)
@lru_cache(maxsize=128)
def build_gpt_academic_masked_string_langbased(text_show_english="", text_show_chinese=""):
    """
    Build a masked string that carries an English and a Chinese variant of a
    prompt; the variant shown later depends on the detected language.
    """
    return (
        "<gpt_academic_string_mask>"
        f"<lang_english>{text_show_english}</lang_english>"
        f"<lang_chinese>{text_show_chinese}</lang_chinese>"
        "</gpt_academic_string_mask>"
    )
if __name__ == "__main__":
    # Quick manual check of the mask round-trip: build a masked string whose
    # LLM payload is "mermaid" and whose render payload is empty, then resolve
    # it for each audience.
    input_string = (
        "你好\n"
        + build_gpt_academic_masked_string(text_show_llm="mermaid", text_show_render="")
        + "你好\n"
    )
    print(
        apply_gpt_academic_string_mask(input_string, "show_llm")
    )  # should print with "mermaid" substituted in place of the mask tags
    print(
        apply_gpt_academic_string_mask(input_string, "show_render")
    )  # should print with the empty render payload in place of the mask tags