镜像自地址
https://github.com/binary-husky/gpt_academic.git
已同步 2025-12-06 22:46:48 +00:00
fix code highlight problem
这个提交包含在:
@@ -10,6 +10,26 @@ from shared_utils.config_loader import get_conf as get_conf
|
||||
pj = os.path.join
|
||||
default_user_name = 'default_user'
|
||||
|
||||
markdown_extension_configs = {
|
||||
'mdx_math': {
|
||||
'enable_dollar_delimiter': True,
|
||||
'use_gitlab_delimiters': False,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
code_highlight_configs = {
|
||||
"pymdownx.superfences": {
|
||||
'css_class': 'codehilite',
|
||||
},
|
||||
"pymdownx.highlight": {
|
||||
'css_class': 'codehilite',
|
||||
'guess_lang': True,
|
||||
# 'auto_title': True,
|
||||
# 'linenums': True
|
||||
}
|
||||
}
|
||||
|
||||
def text_divide_paragraph(text):
|
||||
"""
|
||||
将文本按照段落分隔符分割开,生成带有段落标签的HTML代码。
|
||||
@@ -33,6 +53,7 @@ def text_divide_paragraph(text):
|
||||
text = "</br>".join(lines)
|
||||
return pre + text + suf
|
||||
|
||||
|
||||
def tex2mathml_catch_exception(content, *args, **kwargs):
|
||||
try:
|
||||
content = tex2mathml(content, *args, **kwargs)
|
||||
@@ -40,6 +61,7 @@ def tex2mathml_catch_exception(content, *args, **kwargs):
|
||||
content = content
|
||||
return content
|
||||
|
||||
|
||||
def replace_math_no_render(match):
|
||||
content = match.group(1)
|
||||
if 'mode=display' in match.group(0):
|
||||
@@ -48,6 +70,7 @@ def replace_math_no_render(match):
|
||||
else:
|
||||
return f"<font color=\"#00FF00\">$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$</font>"
|
||||
|
||||
|
||||
def replace_math_render(match):
|
||||
content = match.group(1)
|
||||
if 'mode=display' in match.group(0):
|
||||
@@ -60,6 +83,7 @@ def replace_math_render(match):
|
||||
else:
|
||||
return tex2mathml_catch_exception(content)
|
||||
|
||||
|
||||
def markdown_bug_hunt(content):
|
||||
"""
|
||||
解决一个mdx_math的bug(单$包裹begin命令时多余<script>)
|
||||
@@ -69,6 +93,7 @@ def markdown_bug_hunt(content):
|
||||
content = content.replace('</script>\n</script>', '</script>')
|
||||
return content
|
||||
|
||||
|
||||
def is_equation(txt):
|
||||
"""
|
||||
判定是否为公式 | 测试1 写出洛伦兹定律,使用tex格式公式 测试2 给出柯西不等式,使用latex格式 测试3 写出麦克斯韦方程组
|
||||
@@ -99,6 +124,7 @@ def is_equation(txt):
|
||||
contain_any_eq = True
|
||||
return contain_any_eq
|
||||
|
||||
|
||||
def fix_markdown_indent(txt):
|
||||
# fix markdown indent
|
||||
if (' - ' not in txt) or ('. ' not in txt):
|
||||
@@ -119,6 +145,7 @@ def fix_markdown_indent(txt):
|
||||
lines[i] = ' ' * num_spaces_should_be + stripped_string
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
FENCED_BLOCK_RE = re.compile(
|
||||
dedent(r'''
|
||||
(?P<fence>^[ \t]*(?:~{3,}|`{3,}))[ ]* # opening fence
|
||||
@@ -132,6 +159,7 @@ FENCED_BLOCK_RE = re.compile(
|
||||
re.MULTILINE | re.DOTALL | re.VERBOSE
|
||||
)
|
||||
|
||||
|
||||
def get_line_range(re_match_obj, txt):
|
||||
start_pos, end_pos = re_match_obj.regs[0]
|
||||
num_newlines_before = txt[:start_pos+1].count('\n')
|
||||
@@ -139,6 +167,7 @@ def get_line_range(re_match_obj, txt):
|
||||
line_end = num_newlines_before + txt[start_pos:end_pos].count('\n')+1
|
||||
return line_start, line_end
|
||||
|
||||
|
||||
def fix_code_segment_indent(txt):
|
||||
lines = []
|
||||
change_any = False
|
||||
@@ -175,6 +204,7 @@ def fix_code_segment_indent(txt):
|
||||
else:
|
||||
return txt
|
||||
|
||||
|
||||
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
|
||||
def markdown_convertion(txt):
|
||||
"""
|
||||
@@ -186,12 +216,6 @@ def markdown_convertion(txt):
|
||||
# print('警告,输入了已经经过转化的字符串,二次转化可能出问题')
|
||||
return txt # 已经被转化过,不需要再次转化
|
||||
|
||||
markdown_extension_configs = {
|
||||
'mdx_math': {
|
||||
'enable_dollar_delimiter': True,
|
||||
'use_gitlab_delimiters': False,
|
||||
},
|
||||
}
|
||||
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
|
||||
|
||||
txt = fix_markdown_indent(txt)
|
||||
@@ -199,8 +223,8 @@ def markdown_convertion(txt):
|
||||
if is_equation(txt): # 有$标识的公式符号,且没有代码段```的标识
|
||||
# convert everything to html format
|
||||
split = markdown.markdown(text='---')
|
||||
convert_stage_1 = markdown.markdown(text=txt, extensions=['sane_lists', 'tables', 'mdx_math', 'pymdownx.superfences'],
|
||||
extension_configs=markdown_extension_configs)
|
||||
convert_stage_1 = markdown.markdown(text=txt, extensions=['sane_lists', 'tables', 'mdx_math', 'pymdownx.superfences', 'pymdownx.highlight'],
|
||||
extension_configs={**markdown_extension_configs, **code_highlight_configs})
|
||||
convert_stage_1 = markdown_bug_hunt(convert_stage_1)
|
||||
# 1. convert to easy-to-copy tex (do not render math)
|
||||
convert_stage_2_1, n = re.subn(find_equation_pattern, replace_math_no_render, convert_stage_1, flags=re.DOTALL)
|
||||
@@ -209,7 +233,7 @@ def markdown_convertion(txt):
|
||||
# cat them together
|
||||
return pre + convert_stage_2_1 + f'{split}' + convert_stage_2_2 + suf
|
||||
else:
|
||||
return pre + markdown.markdown(txt, extensions=['sane_lists', 'tables', 'pymdownx.superfences', 'codehilite']) + suf
|
||||
return pre + markdown.markdown(txt, extensions=['sane_lists', 'tables', 'pymdownx.superfences', 'pymdownx.highlight'], extension_configs=code_highlight_configs) + suf
|
||||
|
||||
|
||||
def close_up_code_segment_during_stream(gpt_reply):
|
||||
@@ -250,7 +274,7 @@ def format_io(self, y):
|
||||
if gpt_reply is not None: gpt_reply = close_up_code_segment_during_stream(gpt_reply)
|
||||
# process
|
||||
y[-1] = (
|
||||
None if i_ask is None else markdown.markdown(i_ask, extensions=['pymdownx.superfences', 'tables']),
|
||||
None if i_ask is None else markdown.markdown(i_ask, extensions=['pymdownx.superfences', 'tables', 'pymdownx.highlight'], extension_configs=code_highlight_configs),
|
||||
None if gpt_reply is None else markdown_convertion(gpt_reply)
|
||||
)
|
||||
return y
|
||||
|
||||
在新工单中引用
屏蔽一个用户