# TxT2Docx/markdown_parser.py

"""
Markdown解析模块

负责解析Markdown格式的文本，提取结构化信息和内联格式。
支持标题、列表、代码块、表格、链接等常见Markdown元素。
"""

import re
from typing import List, Dict, Any
from config import config


class MarkdownParser:
    """Markdown解析器类"""

    # Compiled regular expressions for the Markdown constructs recognised by
    # this parser. Line-level patterns are anchored with ^...$ and are meant to
    # be matched against a single line; inline patterns are used with
    # finditer()/sub() on a piece of text.
    PATTERNS = {
        # ATX heading. Groups: (1) leading whitespace, (2) one to six '#',
        # (3) heading text.
        'heading': re.compile(r'^(\s*)(#{1,6})\s+(.+)$'),
        # **bold** - shortest run of text between two '**' markers.
        'bold_asterisk': re.compile(r'\*\*(.+?)\*\*'),
        # __bold__ - shortest run of text between two '__' markers.
        'bold_underscore': re.compile(r'__(.+?)__'),
        # *italic* - the lookarounds require each delimiter '*' not to touch
        # another '*', so the markers of **bold** are not matched.
        'italic_asterisk': re.compile(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)'),
        # _italic_ - no lookarounds here, so this can also match inside
        # __bold__ markers or identifiers such as snake_case_name.
        'italic_underscore': re.compile(r'_(.+?)_'),
        # `code` - backtick-delimited run that contains no backtick.
        'code_inline': re.compile(r'`([^`]+)`'),
        # Fenced code block spanning several lines. Groups: (1) optional
        # language tag, (2) block body. DOTALL lets '.' cross newlines.
        'code_block': re.compile(r'^```(\w+)?\s*\n(.*?)\n```', re.MULTILINE | re.DOTALL),
        # ~~strikethrough~~
        'strikethrough': re.compile(r'~~(.+?)~~'),
        # [text](url). Groups: (1) link text, (2) URL. There is no check for a
        # preceding '!', so the "[alt](url)" part of an image matches as well.
        'link': re.compile(r'\[([^\]]+)\]\(([^)]+)\)'),
        # ![alt](url). Groups: (1) alt text (may be empty), (2) URL.
        'image': re.compile(r'!\[([^\]]*)\]\(([^)]+)\)'),
        # Bullet item introduced by '-', '*' or '+'. Group (1) is the item text.
        'unordered_list': re.compile(r'^\s*[-*+]\s+(.+)$'),
        # Numbered item such as "1. text". Group (1) is the item text.
        'ordered_list': re.compile(r'^\s*\d+\.\s+(.+)$'),
        # "> quoted text". Group (1) is the quoted text (at least one character).
        'blockquote': re.compile(r'^\s*>\s*(.+)$'),
        # Three or more of '-', '*' or '_', each optionally preceded by
        # whitespace; the characters are allowed to be mixed.
        'horizontal_rule': re.compile(r'^(\s*[-*_]){3,}\s*$'),
        # Table row: the line must start and end with '|' (no surrounding
        # whitespace). Group (1) is everything between the outer pipes.
        'table_row': re.compile(r'^\|(.+)\|$'),
        # Header separator row, e.g. "|---|:--:|": cells made of dashes with
        # optional alignment colons.
        'table_separator': re.compile(r'^\|(\s*:?-+:?\s*\|)+$')
    }

    @classmethod
    def parse(cls, txt_content: str) -> List[Dict[str, Any]]:
        """
        Parse Markdown text into a list of sections.

        The text is first split into flat block-level elements, which are then
        grouped under the heading that precedes them.

        Args:
            txt_content: Markdown source text.

        Returns:
            List[Dict[str, Any]]: Section dictionaries; an empty list when
            ``txt_content`` is empty or otherwise falsy.
        """
        # Only run the two parsing stages when there is something to parse.
        if txt_content:
            flat_elements = cls._parse_elements(txt_content)
            return cls._group_by_sections(flat_elements)
        return []

    @classmethod
    def _parse_elements(cls, txt_content: str) -> List[Dict[str, Any]]:
        """
        Split Markdown text into a flat list of block-level elements.

        Every line is classified in this order: code fence / code block body,
        table row, heading, horizontal rule, unordered list item, ordered list
        item, blockquote, empty line and finally plain paragraph.

        Guarantees of this implementation:

        * A table that is directly followed by a code fence is emitted before
          the code block, so elements stay in document order.
        * A code block whose closing fence is missing is emitted at the end of
          the input instead of being silently dropped.
        * A heading deeper than ``config.title_levels`` is not discarded; it
          falls through and is classified like any other line (normally as a
          paragraph), so its text is preserved.

        Args:
            txt_content: Markdown source text.

        Returns:
            List[Dict[str, Any]]: Element dictionaries. Each has ``type`` and
            ``level``. ``code_block`` adds ``language`` and ``content``;
            ``table`` adds ``rows`` (list of lists of stripped cell strings);
            ``heading`` adds ``content`` (raw heading text) and
            ``cleaned_content`` (bold markers removed); ``horizontal_rule``
            has no further keys; all other types add ``content``.
        """
        elements: List[Dict[str, Any]] = []
        patterns = cls.PATTERNS

        in_code_block = False
        code_block_language = ""
        code_block_content: List[str] = []
        table_mode = False
        table_rows: List[List[str]] = []

        def flush_table() -> None:
            """Emit the table collected so far (if it has rows) and reset state."""
            nonlocal table_mode, table_rows
            if table_mode and table_rows:
                elements.append({
                    'type': 'table',
                    'rows': table_rows,
                    'level': 0
                })
            table_mode = False
            table_rows = []

        def emit_code_block() -> None:
            """Emit the code block collected so far."""
            elements.append({
                'type': 'code_block',
                'language': code_block_language,
                'content': '\n'.join(code_block_content),
                'level': 0
            })

        for raw_line in txt_content.split('\n'):
            # Tolerate Windows line endings.
            line = raw_line.rstrip('\r')
            stripped = line.strip()

            # Code fences toggle code-block mode.
            if stripped.startswith('```'):
                if in_code_block:
                    emit_code_block()
                    in_code_block = False
                    code_block_content = []
                    code_block_language = ""
                else:
                    # A pending table ends here; emit it first so that it
                    # precedes the code block in the output.
                    flush_table()
                    in_code_block = True
                    code_block_language = stripped[3:].strip()
                    code_block_content = []
                continue

            # Inside a code block every line is kept verbatim.
            if in_code_block:
                code_block_content.append(line)
                continue

            # Table rows are accumulated until the first non-table line.
            table_match = patterns['table_row'].match(line)
            table_sep_match = patterns['table_separator'].match(line)
            if table_match or table_sep_match:
                table_mode = True
                # The separator row (|---|---|) carries no cell data.
                if table_match and not table_sep_match:
                    table_rows.append(
                        [cell.strip() for cell in table_match.group(1).split('|')]
                    )
                continue
            flush_table()

            # Headings within the configured depth become heading elements;
            # deeper ones fall through so their text is not lost.
            heading_match = patterns['heading'].match(line)
            if heading_match and len(heading_match.group(2)) <= config.title_levels:
                heading_text = heading_match.group(3).strip()
                # Strip bold markers only; the raw text is kept alongside so
                # that inline formatting can still be applied later.
                cleaned_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', heading_text)
                elements.append({
                    'type': 'heading',
                    'level': len(heading_match.group(2)),
                    'content': heading_text,
                    'cleaned_content': cleaned_text
                })
                continue

            # The horizontal rule is tested before lists so that "- - -" or
            # "* * *" is not mistaken for a bullet item.
            if patterns['horizontal_rule'].match(line):
                elements.append({
                    'type': 'horizontal_rule',
                    'level': 0
                })
                continue

            ul_match = patterns['unordered_list'].match(line)
            if ul_match:
                elements.append({
                    'type': 'unordered_list',
                    'content': ul_match.group(1),
                    'level': 0
                })
                continue

            ol_match = patterns['ordered_list'].match(line)
            if ol_match:
                elements.append({
                    'type': 'ordered_list',
                    'content': ol_match.group(1),
                    'level': 0
                })
                continue

            quote_match = patterns['blockquote'].match(line)
            if quote_match:
                elements.append({
                    'type': 'blockquote',
                    'content': quote_match.group(1),
                    'level': 0
                })
                continue

            if stripped == '':
                elements.append({
                    'type': 'empty',
                    'content': '',
                    'level': 0
                })
                continue

            # Anything else is a plain paragraph line.
            elements.append({
                'type': 'paragraph',
                'content': line,
                'level': 0
            })

        # End of input: keep the content of an unterminated code block and
        # emit a table that ran up to the last line.
        if in_code_block:
            emit_code_block()
        flush_table()

        return elements

    @classmethod
    def _group_by_sections(cls, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Group a flat element list into sections, one per heading.

        Elements that appear before the first heading are collected in a
        level-0 preamble section titled ``'前置内容'``; that preamble is only
        returned when it actually contains elements. Every heading opens a
        new section and is always kept, even when it has no body (for example
        a chapter title immediately followed by a sub-heading).

        Args:
            elements: Flat element dictionaries; each must have a ``type`` key
                and heading elements must also have ``level`` and ``content``.

        Returns:
            List[Dict[str, Any]]: Section dictionaries with the keys ``type``
            (always ``'section'``), ``level``, ``content`` (the heading text)
            and ``elements`` (the non-heading elements of that section).
        """
        sections: List[Dict[str, Any]] = []
        current_section: Dict[str, Any] = {
            'type': 'section',
            'level': 0,
            'content': '前置内容',
            'elements': []
        }
        # Tracked with a flag rather than by comparing titles, so a real
        # heading that happens to be named like the preamble is not dropped.
        is_preamble = True

        for element in elements:
            if element['type'] != 'heading':
                current_section['elements'].append(element)
                continue

            # Close the running section. Only an empty preamble is discarded;
            # a heading without body is still part of the document outline.
            if current_section['elements'] or not is_preamble:
                sections.append(current_section)

            current_section = {
                'type': 'section',
                'level': element['level'],
                'content': element['content'],
                'elements': []
            }
            is_preamble = False

        # Close the last open section under the same rule.
        if current_section['elements'] or not is_preamble:
            sections.append(current_section)

        return sections

    @classmethod
    def extract_inline_formatting(cls, text: str) -> List[Dict[str, Any]]:
        """
        Locate inline Markdown formatting spans in a piece of text.

        Recognised spans are bold (``**``/``__``), italic (``*``/``_``),
        inline code, strikethrough and links. Inline code takes precedence:
        a marker that starts inside a code span is treated as literal text
        and is not reported. An italic match is also skipped when it partially
        overlaps a bold span (or, for ``_``, an already found italic span), so
        the delimiters of ``__bold__`` are not reported as italics.

        Args:
            text: Text to analyse.

        Returns:
            List[Dict[str, Any]]: Span dictionaries sorted by ``start``. Each
            has ``type``, ``start`` and ``end`` (offsets into ``text``,
            delimiters included). Links carry ``text`` and ``url``; every
            other type carries ``content`` (the text between the delimiters).
            An empty list is returned for empty or falsy input.
        """
        if not text:
            return []

        patterns = cls.PATTERNS
        formatting: List[Dict[str, Any]] = []

        # Code spans are located up front because they shield their content
        # from every other kind of formatting.
        code_matches = list(patterns['code_inline'].finditer(text))
        code_ranges = [(m.start(), m.end()) for m in code_matches]

        def starts_in_code(match) -> bool:
            """Return True when the match begins inside an inline code span."""
            return any(lo <= match.start() < hi for lo, hi in code_ranges)

        def clashes(match, kinds) -> bool:
            """Return True when either end of the match falls inside a span of the given kinds."""
            return any(
                f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end']
                for f in formatting if f['type'] in kinds
            )

        def collect(pattern_key: str, kind: str, clash_kinds=()) -> None:
            """Append every acceptable match of one pattern as a span of the given kind."""
            for match in patterns[pattern_key].finditer(text):
                if starts_in_code(match):
                    continue
                if clash_kinds and clashes(match, clash_kinds):
                    continue
                formatting.append({
                    'type': kind,
                    'start': match.start(),
                    'end': match.end(),
                    'content': match.group(1)
                })

        # Bold must be collected before italics so the overlap check can see it.
        collect('bold_asterisk', 'bold')
        collect('bold_underscore', 'bold')
        collect('italic_asterisk', 'italic', ('bold',))
        collect('italic_underscore', 'italic', ('bold', 'italic'))

        for match in code_matches:
            formatting.append({
                'type': 'code',
                'start': match.start(),
                'end': match.end(),
                'content': match.group(1)
            })

        collect('strikethrough', 'strikethrough')

        for match in patterns['link'].finditer(text):
            if starts_in_code(match):
                continue
            formatting.append({
                'type': 'link',
                'start': match.start(),
                'end': match.end(),
                'text': match.group(1),
                'url': match.group(2)
            })

        # Stable sort: spans that start at the same offset keep the order in
        # which they were collected.
        formatting.sort(key=lambda span: span['start'])
        return formatting

    @classmethod
    def clean_markdown_text(cls, text: str) -> str:
        """
        Strip inline Markdown markers and return the plain text.

        Removes bold, italic, inline-code and strikethrough delimiters and
        replaces links and images by their link text / alt text. Images are
        handled before links: the link pattern also matches the
        ``[alt](url)`` part of ``![alt](url)``, so the reverse order would
        leave a stray ``!`` in front of the alt text.

        Args:
            text: Text that may contain inline Markdown markers.

        Returns:
            str: The text without markers. Empty or falsy input is returned
            unchanged.
        """
        if not text:
            return text

        # Order matters: bold before italic (so '**' is not read as two
        # italics) and image before link (see docstring).
        removal_order = (
            'bold_asterisk',      # **bold**
            'bold_underscore',    # __bold__
            'italic_asterisk',    # *italic*
            'italic_underscore',  # _italic_
            'code_inline',        # `code`
            'strikethrough',      # ~~strikethrough~~
            'image',              # ![alt](url) -> alt
            'link',               # [text](url) -> text
        )

        cleaned = text
        for pattern_key in removal_order:
            # Group 1 of every pattern is the text to keep.
            cleaned = cls.PATTERNS[pattern_key].sub(r'\1', cleaned)
        return cleaned

    @classmethod
    def get_text_statistics(cls, text: str) -> Dict[str, int]:
        """
        Collect simple statistics about a Markdown text.

        Args:
            text: Markdown source text.

        Returns:
            Dict[str, int]: Counters with the keys

            * ``total_chars`` - length of ``text``;
            * ``total_lines`` - number of ``'\\n'``-separated lines;
            * ``headings`` - heading lines outside code blocks;
            * ``paragraphs`` - non-empty lines outside code blocks that are
              not a heading, table row, list item, blockquote or rule;
            * ``code_blocks`` - opening code fences;
            * ``tables`` - runs of consecutive table-row lines (one per
              table, not one per row);
            * ``links`` - ``[text](url)`` occurrences that are not images;
            * ``images`` - ``![alt](url)`` occurrences.

            All counters are zero for empty or falsy input.
        """
        stats = {
            "total_chars": 0,
            "total_lines": 0,
            "headings": 0,
            "paragraphs": 0,
            "code_blocks": 0,
            "tables": 0,
            "links": 0,
            "images": 0
        }
        if not text:
            return stats

        patterns = cls.PATTERNS
        lines = text.split('\n')
        stats["total_chars"] = len(text)
        stats["total_lines"] = len(lines)

        # Line kinds that are neither counted separately nor as paragraphs.
        other_block_patterns = (
            patterns['unordered_list'],
            patterns['ordered_list'],
            patterns['blockquote'],
            patterns['horizontal_rule'],
        )

        in_code_block = False
        in_table = False

        for raw_line in lines:
            line = raw_line.strip()

            # Any line that is not a table row ends the current table, so
            # remember the previous state and reset it up front.
            was_in_table = in_table
            in_table = False

            if not line:
                continue

            if line.startswith('```'):
                # Only the opening fence counts as a new block.
                if not in_code_block:
                    stats["code_blocks"] += 1
                in_code_block = not in_code_block
                continue

            if in_code_block:
                continue

            if patterns['heading'].match(line):
                stats["headings"] += 1
                continue

            if patterns['table_row'].match(line):
                # Count the table once, on its first row.
                if not was_in_table:
                    stats["tables"] += 1
                in_table = True
                continue

            if not any(p.match(line) for p in other_block_patterns):
                stats["paragraphs"] += 1

        # The link pattern also matches the "[alt](url)" part of an image, so
        # matches directly preceded by '!' are excluded from the link count.
        stats["links"] = sum(
            1 for m in patterns['link'].finditer(text)
            if m.start() == 0 or text[m.start() - 1] != '!'
        )
        stats["images"] = len(patterns['image'].findall(text))

        return stats

    @classmethod
    def validate_markdown(cls, text: str) -> Dict[str, Any]:
        """
        Run basic sanity checks on a Markdown text.

        Checks performed (lines inside fenced code blocks are ignored):

        * a line that starts with ``|`` but is not a complete table row
          (for example, the closing ``|`` is missing) produces a warning;
        * a heading marker with more than six ``#`` produces a warning;
        * a code fence that is never closed produces an error and marks the
          text as invalid.

        Args:
            text: Markdown text to validate.

        Returns:
            Dict[str, Any]: ``{"valid": bool, "warnings": [...], "errors":
            [...]}``. Empty or falsy input is reported as valid with a single
            "text is empty" warning.
        """
        result: Dict[str, Any] = {
            "valid": True,
            "warnings": [],
            "errors": []
        }

        if not text:
            result["warnings"].append("文本为空")
            return result

        table_row = cls.PATTERNS['table_row']
        in_code_block = False

        for line_no, raw_line in enumerate(text.split('\n'), 1):
            stripped = raw_line.strip()

            # Fences toggle code-block mode; code content is never validated.
            if stripped.startswith('```'):
                in_code_block = not in_code_block
                continue

            if in_code_block:
                continue

            # A row must both start and end with '|'. The check has to be
            # made on lines that FAIL the row pattern, because the pattern
            # itself already requires both pipes.
            if stripped.startswith('|') and not table_row.match(stripped):
                result["warnings"].append(f"第{line_no}行: 表格格式可能不完整")

            # The heading pattern only accepts one to six '#', so deeper
            # markers have to be detected by counting the leading '#' run.
            marker_len = len(stripped) - len(stripped.lstrip('#'))
            if marker_len > 6 and stripped[marker_len:marker_len + 1].isspace():
                result["warnings"].append(f"第{line_no}行: 标题层级过深 (>6)")

        # An odd number of fences leaves the last code block open.
        if in_code_block:
            result["errors"].append("代码块未正确闭合")
            result["valid"] = False

        return result


# Create the module-level (global) parser instance
markdown_parser = MarkdownParser()


# Module-level functions kept for compatibility with the old interface
def parse(txt_content: str) -> List[Dict[str, Any]]:
    """Parse Markdown text into sections (legacy-interface wrapper).

    Delegates to ``MarkdownParser.parse`` and returns its result unchanged.
    """
    sections = MarkdownParser.parse(txt_content)
    return sections


def extract_inline_formatting(text: str) -> List[Dict[str, Any]]:
    """Extract inline formatting spans (legacy-interface wrapper).

    Delegates to ``MarkdownParser.extract_inline_formatting`` and returns its
    result unchanged.
    """
    spans = MarkdownParser.extract_inline_formatting(text)
    return spans


def group_by_sections(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Group elements into sections by heading (legacy-interface wrapper).

    Delegates to ``MarkdownParser._group_by_sections`` and returns its result
    unchanged.
    """
    grouped = MarkdownParser._group_by_sections(elements)
    return grouped