gpt_academic text mask imp

2025-12-06 06:26:47 +00:00 · 2024-01-20 18:00:06 +08:00
--- a/shared_utils/advanced_markdown_format.py
+++ b/shared_utils/advanced_markdown_format.py
@@ -4,52 +4,47 @@ import os
 import math
 from textwrap import dedent
 from functools import lru_cache
-from pymdownx.superfences import fence_div_format, fence_code_format
+from pymdownx.superfences import fence_code_format
 from latex2mathml.converter import convert as tex2mathml
 from shared_utils.config_loader import get_conf as get_conf
-
-pj = os.path.join
-default_user_name = 'default_user'
+from shared_utils.text_mask import apply_gpt_academic_string_mask

 markdown_extension_configs = {
-    'mdx_math': {
-        'enable_dollar_delimiter': True,
-        'use_gitlab_delimiters': False,
+    "mdx_math": {
+        "enable_dollar_delimiter": True,
+        "use_gitlab_delimiters": False,
    },
 }

 code_highlight_configs = {
    "pymdownx.superfences": {
-        'css_class': 'codehilite',
+        "css_class": "codehilite",
        "custom_fences": [
-            {
-                'name': 'mermaid',
-                'class': 'mermaid',
-                'format': fence_code_format
-            }
-        ]
+            {"name": "mermaid", "class": "mermaid", "format": fence_code_format}
+        ],
    },
    "pymdownx.highlight": {
-        'css_class': 'codehilite',
-        'guess_lang': True,
+        "css_class": "codehilite",
+        "guess_lang": True,
        # 'auto_title': True,
        # 'linenums': True
-    }
+    },
 }

+
 def text_divide_paragraph(text):
    """
    将文本按照段落分隔符分割开，生成带有段落标签的HTML代码。
    """
    pre = '<div class="markdown-body">'
-    suf = '</div>'
+    suf = "</div>"
    if text.startswith(pre) and text.endswith(suf):
        return text

-    if '```' in text:
+    if "```" in text:
        # careful input
        return text
-    elif '</div>' in text:
+    elif "</div>" in text:
        # careful input
        return text
    else:
@@ -71,20 +66,20 @@ def tex2mathml_catch_exception(content, *args, **kwargs):

 def replace_math_no_render(match):
    content = match.group(1)
-    if 'mode=display' in match.group(0):
-        content = content.replace('\n', '</br>')
-        return f"<font color=\"#00FF00\">$$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$$</font>"
+    if "mode=display" in match.group(0):
+        content = content.replace("\n", "</br>")
+        return f'<font color="#00FF00">$$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$$</font>'
    else:
-        return f"<font color=\"#00FF00\">$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$</font>"
+        return f'<font color="#00FF00">$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$</font>'


 def replace_math_render(match):
    content = match.group(1)
-    if 'mode=display' in match.group(0):
-        if '\\begin{aligned}' in content:
-            content = content.replace('\\begin{aligned}', '\\begin{array}')
-            content = content.replace('\\end{aligned}', '\\end{array}')
-            content = content.replace('&', ' ')
+    if "mode=display" in match.group(0):
+        if "\\begin{aligned}" in content:
+            content = content.replace("\\begin{aligned}", "\\begin{array}")
+            content = content.replace("\\end{aligned}", "\\end{array}")
+            content = content.replace("&", " ")
        content = tex2mathml_catch_exception(content, display="block")
        return content
    else:
@@ -95,9 +90,11 @@ def markdown_bug_hunt(content):
    """
    解决一个mdx_math的bug（单$包裹begin命令时多余<script>）
    """
-    content = content.replace('<script type="math/tex">\n<script type="math/tex; mode=display">',
-                                '<script type="math/tex; mode=display">')
-    content = content.replace('</script>\n</script>', '</script>')
+    content = content.replace(
+        '<script type="math/tex">\n<script type="math/tex; mode=display">',
+        '<script type="math/tex; mode=display">',
+    )
+    content = content.replace("</script>\n</script>", "</script>")
    return content


@@ -105,25 +102,29 @@ def is_equation(txt):
    """
    判定是否为公式 | 测试1 写出洛伦兹定律，使用tex格式公式 测试2 给出柯西不等式，使用latex格式 测试3 写出麦克斯韦方程组
    """
-    if '```' in txt and '```reference' not in txt: return False
-    if '$' not in txt and '\\[' not in txt: return False
+    if "```" in txt and "```reference" not in txt:
+        return False
+    if "$" not in txt and "\\[" not in txt:
+        return False
    mathpatterns = {
-        r'(?<!\\|\$)(\$)([^\$]+)(\$)': {'allow_multi_lines': False},                       #  $...$
-        r'(?<!\\)(\$\$)([^\$]+)(\$\$)': {'allow_multi_lines': True},                       # $$...$$
-        r'(?<!\\)(\\\[)(.+?)(\\\])': {'allow_multi_lines': False},                         # \[...\]
+        r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False},  #  $...$
+        r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True},  # $$...$$
+        r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False},  # \[...\]
        # r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False},                       # \(...\)
        # r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True},  # \begin...\end
        # r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False},                       # $`...`$
    }
    matches = []
    for pattern, property in mathpatterns.items():
-        flags = re.ASCII | re.DOTALL if property['allow_multi_lines'] else re.ASCII
+        flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
        matches.extend(re.findall(pattern, txt, flags))
-    if len(matches) == 0: return False
+    if len(matches) == 0:
+        return False
    contain_any_eq = False
-    illegal_pattern = re.compile(r'[^\x00-\x7F]|echo')
+    illegal_pattern = re.compile(r"[^\x00-\x7F]|echo")
    for match in matches:
-        if len(match) != 3: return False
+        if len(match) != 3:
+            return False
        eq_canidate = match[1]
        if illegal_pattern.search(eq_canidate):
            return False
@@ -134,27 +135,28 @@ def is_equation(txt):

 def fix_markdown_indent(txt):
    # fix markdown indent
-    if (' - ' not in txt) or ('. ' not in txt):
+    if (" - " not in txt) or (". " not in txt):
        # do not need to fix, fast escape
        return txt
    # walk through the lines and fix non-standard indentation
    lines = txt.split("\n")
-    pattern = re.compile(r'^\s+-')
+    pattern = re.compile(r"^\s+-")
    activated = False
    for i, line in enumerate(lines):
-        if line.startswith('- ') or line.startswith('1. '):
+        if line.startswith("- ") or line.startswith("1. "):
            activated = True
        if activated and pattern.match(line):
            stripped_string = line.lstrip()
            num_spaces = len(line) - len(stripped_string)
            if (num_spaces % 4) == 3:
                num_spaces_should_be = math.ceil(num_spaces / 4) * 4
-                lines[i] = ' ' * num_spaces_should_be + stripped_string
-    return '\n'.join(lines)
+                lines[i] = " " * num_spaces_should_be + stripped_string
+    return "\n".join(lines)


 FENCED_BLOCK_RE = re.compile(
-    dedent(r'''
+    dedent(
+        r"""
        (?P<fence>^[ \t]*(?:~{3,}|`{3,}))[ ]*                      # opening fence
        ((\{(?P<attrs>[^\}\n]*)\})|                              # (optional {attrs} or
        (\.?(?P<lang>[\w#.+-]*)[ ]*)?                            # optional (.)lang
@@ -162,16 +164,17 @@ FENCED_BLOCK_RE = re.compile(
        \n                                                       # newline (end of opening fence)
        (?P<code>.*?)(?<=\n)                                     # the code block
        (?P=fence)[ ]*$                                          # closing fence
-    '''),
-    re.MULTILINE | re.DOTALL | re.VERBOSE
+    """
+    ),
+    re.MULTILINE | re.DOTALL | re.VERBOSE,
 )


 def get_line_range(re_match_obj, txt):
    start_pos, end_pos = re_match_obj.regs[0]
-    num_newlines_before = txt[:start_pos+1].count('\n')
+    num_newlines_before = txt[: start_pos + 1].count("\n")
    line_start = num_newlines_before
-    line_end = num_newlines_before + txt[start_pos:end_pos].count('\n')+1
+    line_end = num_newlines_before + txt[start_pos:end_pos].count("\n") + 1
    return line_start, line_end


@@ -181,14 +184,16 @@ def fix_code_segment_indent(txt):
    txt_tmp = txt
    while True:
        re_match_obj = FENCED_BLOCK_RE.search(txt_tmp)
-        if not re_match_obj: break
-        if len(lines) == 0: lines = txt.split("\n")
-        
+        if not re_match_obj:
+            break
+        if len(lines) == 0:
+            lines = txt.split("\n")
+
        # 清空 txt_tmp 对应的位置方便下次搜索
        start_pos, end_pos = re_match_obj.regs[0]
-        txt_tmp = txt_tmp[:start_pos] + ' '*(end_pos-start_pos) + txt_tmp[end_pos:]
+        txt_tmp = txt_tmp[:start_pos] + " " * (end_pos - start_pos) + txt_tmp[end_pos:]
        line_start, line_end = get_line_range(re_match_obj, txt)
-        
+
        # 获取公共缩进
        shared_indent_cnt = 1e5
        for i in range(line_start, line_end):
@@ -202,26 +207,26 @@ def fix_code_segment_indent(txt):
            num_spaces_should_be = math.ceil(shared_indent_cnt / 4) * 4
            for i in range(line_start, line_end):
                add_n = num_spaces_should_be - shared_indent_cnt
-                lines[i] = ' ' * add_n + lines[i]
-            if not change_any: # 遇到第一个
+                lines[i] = " " * add_n + lines[i]
+            if not change_any:  # 遇到第一个
                change_any = True

    if change_any:
-        return '\n'.join(lines)
+        return "\n".join(lines)
    else:
        return txt
-    
-    
-@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
+
+
+@lru_cache(maxsize=128)  # 使用 lru缓存 加快转换速度
 def markdown_convertion(txt):
    """
    将Markdown格式的文本转换为HTML格式。如果包含数学公式，则先将公式转换为HTML格式。
    """
    pre = '<div class="markdown-body">'
-    suf = '</div>'
+    suf = "</div>"
    if txt.startswith(pre) and txt.endswith(suf):
        # print('警告，输入了已经经过转化的字符串，二次转化可能出问题')
-        return txt # 已经被转化过，不需要再次转化
+        return txt  # 已经被转化过，不需要再次转化

    find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'

@@ -229,18 +234,47 @@ def markdown_convertion(txt):
    # txt = fix_code_segment_indent(txt)
    if is_equation(txt):  # 有$标识的公式符号，且没有代码段```的标识
        # convert everything to html format
-        split = markdown.markdown(text='---')
-        convert_stage_1 = markdown.markdown(text=txt, extensions=['sane_lists', 'tables', 'mdx_math', 'pymdownx.superfences', 'pymdownx.highlight'],
-                                            extension_configs={**markdown_extension_configs, **code_highlight_configs})
+        split = markdown.markdown(text="---")
+        convert_stage_1 = markdown.markdown(
+            text=txt,
+            extensions=[
+                "sane_lists",
+                "tables",
+                "mdx_math",
+                "pymdownx.superfences",
+                "pymdownx.highlight",
+            ],
+            extension_configs={**markdown_extension_configs, **code_highlight_configs},
+        )
        convert_stage_1 = markdown_bug_hunt(convert_stage_1)
        # 1. convert to easy-to-copy tex (do not render math)
-        convert_stage_2_1, n = re.subn(find_equation_pattern, replace_math_no_render, convert_stage_1, flags=re.DOTALL)
+        convert_stage_2_1, n = re.subn(
+            find_equation_pattern,
+            replace_math_no_render,
+            convert_stage_1,
+            flags=re.DOTALL,
+        )
        # 2. convert to rendered equation
-        convert_stage_2_2, n = re.subn(find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL)
+        convert_stage_2_2, n = re.subn(
+            find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
+        )
        # cat them together
-        return pre + convert_stage_2_1 + f'{split}' + convert_stage_2_2 + suf
+        return pre + convert_stage_2_1 + f"{split}" + convert_stage_2_2 + suf
    else:
-        return pre + markdown.markdown(txt, extensions=['sane_lists', 'tables', 'pymdownx.superfences', 'pymdownx.highlight'], extension_configs=code_highlight_configs) + suf
+        return (
+            pre
+            + markdown.markdown(
+                txt,
+                extensions=[
+                    "sane_lists",
+                    "tables",
+                    "pymdownx.superfences",
+                    "pymdownx.highlight",
+                ],
+                extension_configs=code_highlight_configs,
+            )
+            + suf
+        )


 def close_up_code_segment_during_stream(gpt_reply):
@@ -254,16 +288,16 @@ def close_up_code_segment_during_stream(gpt_reply):
        str: 返回一个新的字符串，将输出代码片段的“后面的```”补上。

    """
-    if '```' not in gpt_reply:
+    if "```" not in gpt_reply:
        return gpt_reply
-    if gpt_reply.endswith('```'):
+    if gpt_reply.endswith("```"):
        return gpt_reply

    # 排除了以上两个情况，我们
-    segments = gpt_reply.split('```')
+    segments = gpt_reply.split("```")
    n_mark = len(segments) - 1
    if n_mark % 2 == 1:
-        return gpt_reply + '\n```' # 输出代码片段中！
+        return gpt_reply + "\n```"  # 输出代码片段中！
    else:
        return gpt_reply

@@ -275,13 +309,23 @@ def format_io(self, y):
    if y is None or y == []:
        return []
    i_ask, gpt_reply = y[-1]
+    i_ask = apply_gpt_academic_string_mask(i_ask, mode="show_render")
+    gpt_reply = apply_gpt_academic_string_mask(gpt_reply, mode="show_render")
    # 输入部分太自由，预处理一波
-    if i_ask is not None: i_ask = text_divide_paragraph(i_ask)
+    if i_ask is not None:
+        i_ask = text_divide_paragraph(i_ask)
    # 当代码输出半截的时候，试着补上后个```
-    if gpt_reply is not None: gpt_reply = close_up_code_segment_during_stream(gpt_reply)
-    # process
+    if gpt_reply is not None:
+        gpt_reply = close_up_code_segment_during_stream(gpt_reply)
+    # 处理提问与输出
    y[-1] = (
-        None if i_ask is None else markdown.markdown(i_ask, extensions=['pymdownx.superfences', 'tables', 'pymdownx.highlight'], extension_configs=code_highlight_configs),
-        None if gpt_reply is None else markdown_convertion(gpt_reply)
+        None
+        if i_ask is None
+        else markdown.markdown(
+            i_ask,
+            extensions=["pymdownx.superfences", "tables", "pymdownx.highlight"],
+            extension_configs=code_highlight_configs,
+        ),
+        None if gpt_reply is None else markdown_convertion(gpt_reply),
    )
    return y
--- a/shared_utils/text_mask.py
+++ b/shared_utils/text_mask.py
@@ -0,0 +1,78 @@
+import re
+from functools import lru_cache
+
+# A masked span carries two alternative payloads: one for the LLM and one for
+# the web renderer. Its literal layout is:
+#   <gpt_academic_string_mask><show_llm>A</show_llm><show_render>B</show_render></gpt_academic_string_mask>
+# Notes on the regular expression compiled into const_extract_re below:
+# - (.*?) is a non-greedy capture group, so each match stops at the nearest
+#   closing tag and several masked spans in one string are matched separately.
+# - Group 1 is the show_llm payload, group 2 is the show_render payload.
+# - re.DOTALL lets "." match newlines; without it a payload spanning several
+#   lines would not match and its tags would be left in the output.
+const_extract_re = re.compile(
+    r"<gpt_academic_string_mask><show_llm>(.*?)</show_llm><show_render>(.*?)</show_render></gpt_academic_string_mask>",
+    flags=re.DOTALL,
+)
+
+
+@lru_cache(maxsize=128)
+def apply_gpt_academic_string_mask(string, mode="show_all"):
+    """
+    Resolve the mask tags in ``string`` for the audience selected by ``mode``.
+
+    Diagram: https://mermaid.live/edit#pako:eNqlkUtLw0AUhf9KuOta0iaTplkIPlpduFJwoZEwJGNbzItpita2O6tF8QGKogXFtwu7cSHiq3-mk_oznFR8IYLgrGbuOd9hDrcCpmcR0GDW9ubNPKaBMDauuwI_A9M6YN-3y0bODwxsYos4BdMoBrTg5gwHF-d0mBH6-vqFQe58ed5m9XPW2uteX3Tubrj0ljLYcwxxR3h1zB43WeMs3G19yEM9uapDMe_NG9i2dagKw1Fee4c1D9nGEbtc-5n6HbNtJ8IyHOs8tbs7V2HrlDX2w2Y7XD_5haHEtQiNsOwfMVa_7TzsvrWIuJGo02qTrdwLk9gukQylHv3Afv1ML270s-HZUndrmW1tdA-WfvbM_jMFYuAQ6uCCxVdciTJ1CPLEITpo_GphypeouzXuw6XAmyi7JmgBLZEYlHwLB2S4gHMUO-9DH7tTnvf1CVoFFkBLSOk4QmlRTqpIlaWUHINyNFXjaQWpCYRURUKiWovBYo8X4ymEJFlECQUpqaQkJmuvWygPpg
+
+    Args:
+        string: Text that may contain masked spans. ``None`` and empty strings
+            are returned unchanged.
+        mode: "show_all" returns the text untouched, "show_llm" keeps the LLM
+            payload of every span, "show_render" keeps the render payload.
+
+    Returns:
+        The text with every masked span replaced by the selected payload.
+
+    Raises:
+        ValueError: If ``mode`` is not one of the three supported values.
+    """
+    if mode == "show_all":
+        return string
+    if mode == "show_llm":
+        replacement = r"\1"
+    elif mode == "show_render":
+        replacement = r"\2"
+    else:
+        raise ValueError("Invalid mode")
+    # Callers may pass None for a missing message; re.sub would raise
+    # TypeError on it, so falsy input is handed back untouched.
+    if not string:
+        return string
+    return const_extract_re.sub(replacement, string)
+
+
+@lru_cache(maxsize=128)
+def build_gpt_academic_masked_string(text_show_llm="", text_show_render=""):
+    """
+    Build one masked span from the two payloads.
+
+    Args:
+        text_show_llm: Payload kept when the span is resolved with "show_llm".
+        text_show_render: Payload kept when it is resolved with "show_render".
+
+    Returns:
+        The tagged string that apply_gpt_academic_string_mask can resolve.
+    """
+    return f"<gpt_academic_string_mask><show_llm>{text_show_llm}</show_llm><show_render>{text_show_render}</show_render></gpt_academic_string_mask>"
+
+
+if __name__ == "__main__":
+    # Manual smoke test.
+    input_string = (
+        "你好\n"
+        + build_gpt_academic_masked_string(text_show_llm="mermaid", text_show_render="")
+        + "你好\n"
+    )
+    # The mask is replaced by its LLM payload "mermaid".
+    print(apply_gpt_academic_string_mask(input_string, "show_llm"))
+    # The mask is replaced by its (empty) render payload, i.e. it is removed.
+    print(apply_gpt_academic_string_mask(input_string, "show_render"))