TxT2Docx/markdown_parser.py
2025-09-21 19:01:40 +08:00

538 lines
17 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Markdown解析模块
负责解析Markdown格式的文本提取结构化信息和内联格式。
支持标题、列表、代码块、表格、链接等常见Markdown元素。
"""
import re
from typing import List, Dict, Any
from config import config
class MarkdownParser:
    """Line-oriented Markdown parser.

    Every method is a classmethod or staticmethod; the class keeps no
    instance state. ``parse`` turns Markdown text into a list of
    heading-delimited sections. The other public helpers extract inline
    formatting spans, strip inline markup, collect simple statistics and run
    basic validation.
    """

    # Highest heading level Markdown defines (``######``).
    _MAX_HEADING_LEVEL = 6

    # Title of the implicit section holding content that precedes any heading.
    _PREFACE_TITLE = '前置内容'

    # A line that looks like a heading but uses more than six ``#`` markers.
    _DEEP_HEADING = re.compile(r'^\s*#{7,}\s+\S')

    # Compiled patterns for the supported Markdown constructs.
    PATTERNS = {
        'heading': re.compile(r'^(\s*)(#{1,6})\s+(.+)$'),
        'bold_asterisk': re.compile(r'\*\*(.+?)\*\*'),
        'bold_underscore': re.compile(r'__(.+?)__'),
        'italic_asterisk': re.compile(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)'),
        # Underscores surrounded by ASCII word characters (snake_case names,
        # file names, URLs) are not emphasis delimiters, so they must be
        # bounded by something else on the outside.
        'italic_underscore': re.compile(
            r'(?<![A-Za-z0-9_])_(?!_)(.+?)(?<!_)_(?![A-Za-z0-9_])'
        ),
        'code_inline': re.compile(r'`([^`]+)`'),
        'code_block': re.compile(r'^```(\w+)?\s*\n(.*?)\n```', re.MULTILINE | re.DOTALL),
        'strikethrough': re.compile(r'~~(.+?)~~'),
        'link': re.compile(r'\[([^\]]+)\]\(([^)]+)\)'),
        'image': re.compile(r'!\[([^\]]*)\]\(([^)]+)\)'),
        'unordered_list': re.compile(r'^\s*[-*+]\s+(.+)$'),
        'ordered_list': re.compile(r'^\s*\d+\.\s+(.+)$'),
        'blockquote': re.compile(r'^\s*>\s*(.+)$'),
        'horizontal_rule': re.compile(r'^(\s*[-*_]){3,}\s*$'),
        # Surrounding whitespace is tolerated so that a stray trailing space
        # does not turn a table row into a plain paragraph.
        'table_row': re.compile(r'^\s*\|(.+)\|\s*$'),
        'table_separator': re.compile(r'^\s*\|(\s*:?-+:?\s*\|)+\s*$'),
    }

    @classmethod
    def parse(cls, txt_content: str) -> List[Dict[str, Any]]:
        """Parse Markdown text into a list of sections.

        Args:
            txt_content: Markdown source text.

        Returns:
            List[Dict[str, Any]]: One ``section`` dict per heading (plus an
            optional leading section for content before the first heading).
            Empty input yields an empty list.
        """
        if not txt_content:
            return []
        return cls._group_by_sections(cls._parse_elements(txt_content))

    @staticmethod
    def _code_block_element(language: str, lines: List[str]) -> Dict[str, Any]:
        """Build a ``code_block`` element from a fence language tag and body lines."""
        return {
            'type': 'code_block',
            'language': language,
            'content': '\n'.join(lines),
            'level': 0,
        }

    @staticmethod
    def _flush_table(elements: List[Dict[str, Any]],
                     table_rows: List[List[str]]) -> List[List[str]]:
        """Append pending rows as a ``table`` element and return a fresh row buffer."""
        if table_rows:
            elements.append({
                'type': 'table',
                'rows': table_rows,
                'level': 0,
            })
        return []

    @classmethod
    def _parse_elements(cls, txt_content: str) -> List[Dict[str, Any]]:
        """Split Markdown text into a flat list of block-level elements.

        Each input line becomes one element, except that consecutive table
        rows are merged into a single ``table`` element and the lines between
        two code fences are merged into a single ``code_block`` element.

        Args:
            txt_content: Markdown source text.

        Returns:
            List[Dict[str, Any]]: Elements in document order. Every element
            has a ``type`` and a ``level`` key; the remaining keys depend on
            the type.
        """
        elements: List[Dict[str, Any]] = []
        in_code_block = False
        code_block_language = ""
        code_block_content: List[str] = []
        table_rows: List[List[str]] = []

        for raw_line in txt_content.split('\n'):
            # Tolerate Windows line endings.
            line = raw_line.rstrip('\r')
            stripped = line.strip()

            # Code fences toggle code-block mode.
            if stripped.startswith('```'):
                if in_code_block:
                    elements.append(
                        cls._code_block_element(code_block_language, code_block_content)
                    )
                    in_code_block = False
                    code_block_language = ""
                else:
                    # A table directly above the fence ends here; flush it now
                    # so it keeps its position before the code block.
                    table_rows = cls._flush_table(elements, table_rows)
                    in_code_block = True
                    code_block_language = stripped[3:].strip()
                code_block_content = []
                continue

            if in_code_block:
                code_block_content.append(line)
                continue

            # Table rows are buffered until the first non-table line.
            row_match = cls.PATTERNS['table_row'].match(line)
            separator_match = cls.PATTERNS['table_separator'].match(line)
            if row_match or separator_match:
                # Separator lines (|---|---|) carry no cell data.
                if row_match and not separator_match:
                    table_rows.append(
                        [cell.strip() for cell in row_match.group(1).split('|')]
                    )
                continue
            table_rows = cls._flush_table(elements, table_rows)

            # Headings. ``config`` is the settings object imported at module
            # level; only levels up to ``config.title_levels`` count as
            # headings. Deeper ones fall through and are kept as ordinary
            # text so their content is not lost.
            heading_match = cls.PATTERNS['heading'].match(line)
            if heading_match:
                level = len(heading_match.group(2))
                if level <= config.title_levels:
                    heading_text = heading_match.group(3).strip()
                    elements.append({
                        'type': 'heading',
                        'level': level,
                        # Raw text, still carrying inline markup.
                        'content': heading_text,
                        # Plain text with inline markup removed.
                        'cleaned_content': cls.clean_markdown_text(heading_text),
                    })
                    continue

            # Horizontal rule (checked before lists: "- - -" is a rule).
            if cls.PATTERNS['horizontal_rule'].match(line):
                elements.append({
                    'type': 'horizontal_rule',
                    'level': 0,
                })
                continue

            # List items.
            ul_match = cls.PATTERNS['unordered_list'].match(line)
            if ul_match:
                elements.append({
                    'type': 'unordered_list',
                    'content': ul_match.group(1),
                    'level': 0,
                })
                continue

            ol_match = cls.PATTERNS['ordered_list'].match(line)
            if ol_match:
                elements.append({
                    'type': 'ordered_list',
                    'content': ol_match.group(1),
                    'level': 0,
                })
                continue

            # Block quotes.
            quote_match = cls.PATTERNS['blockquote'].match(line)
            if quote_match:
                elements.append({
                    'type': 'blockquote',
                    'content': quote_match.group(1),
                    'level': 0,
                })
                continue

            # Blank lines.
            if stripped == '':
                elements.append({
                    'type': 'empty',
                    'content': '',
                    'level': 0,
                })
                continue

            # Anything else is a paragraph line.
            elements.append({
                'type': 'paragraph',
                'content': line,
                'level': 0,
            })

        # An unclosed fence still yields its collected lines instead of
        # silently discarding the rest of the document.
        if in_code_block:
            elements.append(
                cls._code_block_element(code_block_language, code_block_content)
            )

        # Flush a table that runs to the end of the text.
        cls._flush_table(elements, table_rows)
        return elements

    @classmethod
    def _group_by_sections(cls, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Group a flat element list into heading-delimited sections.

        Content before the first heading goes into an implicit level-0
        section, which is emitted only if it is non-empty. Every heading
        produces a section, even when no content follows it.

        Args:
            elements: Elements as produced by ``_parse_elements``.

        Returns:
            List[Dict[str, Any]]: Section dicts with the keys ``type``,
            ``level``, ``content`` (heading text) and ``elements``.
        """
        sections: List[Dict[str, Any]] = []
        current: Dict[str, Any] = {
            'type': 'section',
            'level': 0,
            'content': cls._PREFACE_TITLE,
            'elements': [],
        }
        # Tracked explicitly rather than by comparing titles, so a real
        # heading that happens to share the preface title is not dropped.
        is_preface = True

        for element in elements:
            if element['type'] != 'heading':
                current['elements'].append(element)
                continue

            # Close the running section; only an empty preface is discarded.
            if current['elements'] or not is_preface:
                sections.append(current)
            current = {
                'type': 'section',
                'level': element['level'],
                'content': element['content'],
                'elements': [],
            }
            is_preface = False

        if current['elements'] or not is_preface:
            sections.append(current)
        return sections

    @classmethod
    def extract_inline_formatting(cls, text: str) -> List[Dict[str, Any]]:
        """Locate inline formatting spans in a piece of text.

        Args:
            text: Text to analyse.

        Returns:
            List[Dict[str, Any]]: Spans sorted by start offset. Each span has
            ``type``, ``start`` and ``end`` (offsets into ``text`` that include
            the delimiters). Links additionally carry ``text`` and ``url``;
            all other types carry ``content``.
        """
        if not text:
            return []

        spans: List[Dict[str, Any]] = []

        def add(kind: str, match) -> None:
            spans.append({
                'type': kind,
                'start': match.start(),
                'end': match.end(),
                'content': match.group(1),
            })

        def overlaps(match, kinds) -> bool:
            # True when either end of the match falls inside an already
            # recorded span of one of the given kinds.
            return any(
                span['start'] <= match.start() < span['end']
                or span['start'] < match.end() <= span['end']
                for span in spans
                if span['type'] in kinds
            )

        # Bold first, so that italic candidates can be checked against it.
        for key in ('bold_asterisk', 'bold_underscore'):
            for match in cls.PATTERNS[key].finditer(text):
                add('bold', match)

        for match in cls.PATTERNS['italic_asterisk'].finditer(text):
            if not overlaps(match, ('bold',)):
                add('italic', match)

        for match in cls.PATTERNS['italic_underscore'].finditer(text):
            if not overlaps(match, ('bold', 'italic')):
                add('italic', match)

        for match in cls.PATTERNS['code_inline'].finditer(text):
            add('code', match)

        for match in cls.PATTERNS['strikethrough'].finditer(text):
            add('strikethrough', match)

        for match in cls.PATTERNS['link'].finditer(text):
            spans.append({
                'type': 'link',
                'start': match.start(),
                'end': match.end(),
                'text': match.group(1),
                'url': match.group(2),
            })

        spans.sort(key=lambda span: span['start'])
        return spans

    @classmethod
    def clean_markdown_text(cls, text: str) -> str:
        """Strip inline Markdown markup and return the plain text.

        Args:
            text: Text that may contain inline Markdown markup.

        Returns:
            str: The text with bold, italic, inline-code and strikethrough
            delimiters removed, images replaced by their alt text and links
            replaced by their link text. Falsy input is returned unchanged.
        """
        if not text:
            return text

        cleaned = text
        # Emphasis: bold before italic so "**x**" is not read as nested "*".
        cleaned = cls.PATTERNS['bold_asterisk'].sub(r'\1', cleaned)
        cleaned = cls.PATTERNS['bold_underscore'].sub(r'\1', cleaned)
        cleaned = cls.PATTERNS['italic_asterisk'].sub(r'\1', cleaned)
        cleaned = cls.PATTERNS['italic_underscore'].sub(r'\1', cleaned)
        cleaned = cls.PATTERNS['code_inline'].sub(r'\1', cleaned)
        cleaned = cls.PATTERNS['strikethrough'].sub(r'\1', cleaned)
        # Images must go before links: the link pattern also matches the
        # "[alt](url)" part of an image and would leave a stray "!" behind.
        cleaned = cls.PATTERNS['image'].sub(r'\1', cleaned)
        cleaned = cls.PATTERNS['link'].sub(r'\1', cleaned)
        return cleaned

    @classmethod
    def get_text_statistics(cls, text: str) -> Dict[str, int]:
        """Collect simple statistics about a Markdown text.

        Args:
            text: Markdown source text.

        Returns:
            Dict[str, int]: Counts for ``total_chars``, ``total_lines``,
            ``headings``, ``paragraphs`` (non-blank lines that are not any
            other block type), ``code_blocks``, ``tables`` (runs of
            consecutive table rows), ``links`` (images excluded) and
            ``images``. All counts are 0 for empty input.
        """
        stats = {
            "total_chars": 0,
            "total_lines": 0,
            "headings": 0,
            "paragraphs": 0,
            "code_blocks": 0,
            "tables": 0,
            "links": 0,
            "images": 0,
        }
        if not text:
            return stats

        lines = text.split('\n')
        stats["total_chars"] = len(text)
        stats["total_lines"] = len(lines)

        in_code_block = False
        # Whether the current run of table lines has already been counted.
        table_counted = False

        for raw_line in lines:
            line = raw_line.strip()

            if not line:
                table_counted = False
                continue

            if line.startswith('```'):
                table_counted = False
                if not in_code_block:
                    stats["code_blocks"] += 1
                in_code_block = not in_code_block
                continue

            if in_code_block:
                continue

            if cls.PATTERNS['table_row'].match(line):
                # Count a table once, on its first row that carries data.
                if not table_counted and not cls.PATTERNS['table_separator'].match(line):
                    stats["tables"] += 1
                    table_counted = True
                continue
            table_counted = False

            if cls.PATTERNS['heading'].match(line):
                stats["headings"] += 1
                continue

            is_other_block = (
                cls.PATTERNS['unordered_list'].match(line)
                or cls.PATTERNS['ordered_list'].match(line)
                or cls.PATTERNS['blockquote'].match(line)
                or cls.PATTERNS['horizontal_rule'].match(line)
            )
            if not is_other_block:
                stats["paragraphs"] += 1

        # The link pattern also matches the bracket part of an image, so
        # matches preceded by "!" are excluded from the link count.
        stats["links"] = sum(
            1
            for match in cls.PATTERNS['link'].finditer(text)
            if match.start() == 0 or text[match.start() - 1] != '!'
        )
        stats["images"] = len(cls.PATTERNS['image'].findall(text))
        return stats

    @classmethod
    def validate_markdown(cls, text: str) -> Dict[str, Any]:
        """Run basic sanity checks on a Markdown text.

        Args:
            text: Markdown text to validate.

        Returns:
            Dict[str, Any]: ``valid`` (bool), ``warnings`` (list of str) and
            ``errors`` (list of str). An unclosed code fence is an error and
            sets ``valid`` to False. Empty text, table rows with a pipe on
            only one edge, and headings with more than six ``#`` markers are
            reported as warnings.
        """
        result: Dict[str, Any] = {
            "valid": True,
            "warnings": [],
            "errors": [],
        }
        if not text:
            result["warnings"].append("文本为空")
            return result

        max_level = cls._MAX_HEADING_LEVEL
        in_code_block = False

        for line_no, raw_line in enumerate(text.split('\n'), 1):
            line = raw_line.rstrip()
            stripped = line.strip()

            if stripped.startswith('```'):
                in_code_block = not in_code_block
                continue
            if in_code_block:
                continue

            # A row with several pipes but a pipe on only one edge will not
            # be recognised as a table row by the parser.
            if (stripped.startswith('|') != stripped.endswith('|')
                    and stripped.count('|') >= 2):
                result["warnings"].append(f"{line_no}行: 表格格式可能不完整")

            # The heading pattern caps at six "#", so deeper headings need
            # their own check.
            if cls._DEEP_HEADING.match(line):
                result["warnings"].append(f"{line_no}行: 标题层级过深 (>{max_level})")

        if in_code_block:
            result["errors"].append("代码块未正确闭合")
            result["valid"] = False
        return result
# Create the module-level (global) parser instance.
markdown_parser = MarkdownParser()
# Module-level functions kept for compatibility with the legacy interface.
def parse(txt_content: str) -> List[Dict[str, Any]]:
    """Parse Markdown content by delegating to ``MarkdownParser.parse`` (legacy interface)."""
    sections = MarkdownParser.parse(txt_content)
    return sections
def extract_inline_formatting(text: str) -> List[Dict[str, Any]]:
    """Extract inline formatting by delegating to ``MarkdownParser.extract_inline_formatting`` (legacy interface)."""
    spans = MarkdownParser.extract_inline_formatting(text)
    return spans
def group_by_sections(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Group elements into sections by delegating to ``MarkdownParser._group_by_sections`` (legacy interface)."""
    grouped = MarkdownParser._group_by_sections(elements)
    return grouped