# TxT2Docx/docx_generator.py

"""
DOCX文档生成模块

负责将解析后的Markdown结构转换为DOCX文档，包括文本格式化、图片插入和样式设置。
"""

import os
import re
from typing import List, Dict, Any, Callable, Optional
from docx import Document
from docx.document import Document as DocxDocument
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE

from config import config
from text_processor import text_processor
from image_processor import ImageProcessor
from markdown_parser import MarkdownParser
from style_manager import style_manager


# Disclaimer text written at the end of the document by
# DocxGenerator._add_disclaimer (only when config.add_disclaimer is set).
# The literal is wrapped in backticks, i.e. Markdown inline-code markers.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络，文章旨在传播正能量，均无低俗等不良引导，请观众勿对号入座，并上升到人身攻击等方面。观众理性看待本事件，切勿留下主观臆断的恶意评论，互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题，请及时联系作者，我们将予以删除。`"""


class DocxGenerator:
    """DOCX文档生成器类"""

    def __init__(self):
        """初始化DOCX生成器"""
        self.temp_files = []  # 跟踪临时文件以便清理
        self.current_document_style = None  # 当前使用的文档样式
        self.paragraph_count = 0  # 段落计数器，用于无标题文章的图片插入控制

    def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
                 output_path: str, progress_callback: Optional[Callable] = None) -> bool:
        """
        生成DOCX文档

        Args:
            sections: 解析后的文档章节列表
            image_files: 图片文件路径列表
            output_path: 输出文件路径
            progress_callback: 进度回调函数

        Returns:
            bool: 是否生成成功

        Raises:
            Exception: 生成失败时
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            total_sections = len(sections)
            image_index = 0
            image_count = len(image_files)

            for i, section in enumerate(sections):
                if progress_callback:
                    progress = int((i / total_sections) * 100)
                    section_title = section['content'][:30] + "..." if len(section['content']) > 30 else section['content']
                    progress_callback(progress, f"处理章节: {section_title}")

                # 添加章节内容
                image_index = self._add_section_to_doc(doc, section, image_files, image_index, image_count, output_path)

            # 添加免责声明
            if config.add_disclaimer:
                self._add_disclaimer(doc)

            # 保存文档
            doc.save(output_path)

            if progress_callback:
                progress_callback(100, "转换完成!")

            return True

        except Exception as e:
            raise Exception(f"生成DOCX失败: {str(e)}")
        finally:
            # 清理临时文件
            self._cleanup_temp_files()

    def _setup_document_styles(self, doc) -> None:
        """
        设置文档样式

        Args:
            doc: DOCX文档对象
        """
        try:
            # 获取当前选中的样式
            current_style = style_manager.get_style(config.current_style)
            if not current_style:
                print(f"警告: 找不到样式 '{config.current_style}'，使用默认样式")
                return

            self.current_document_style = current_style
            print(f"应用文档样式: {current_style.name}")

        except Exception as e:
            print(f"设置文档样式时出错: {e}")

    def _add_section_to_doc(self, doc: DocxDocument, section: Dict[str, Any],
                           image_files: List[str], image_index: int, image_count: int,
                           output_path: str) -> int:
        """
        添加章节内容到文档

        Args:
            doc: DOCX文档对象
            section: 章节数据
            image_files: 图片文件列表
            image_index: 当前图片索引
            image_count: 图片总数
            output_path: 输出文件路径（用于临时文件）

        Returns:
            int: 更新后的图片索引
        """
        # 添加章节标题
        if section['level'] > 0 and section['level'] <= config.title_levels:
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_heading(level=section['level'])
            # 清空默认内容，应用自定义样式
            para.clear()
            run = para.add_run(heading_text)

            # 应用标题样式
            if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.heading_styles:
                if section['level'] in self.current_document_style.heading_styles:
                    heading_style = self.current_document_style.heading_styles[section['level']]
                    if heading_style.font:
                        run.font.name = heading_style.font.name
                        run.font.size = Pt(heading_style.font.size)
                        run.font.bold = heading_style.font.bold
                        run.font.italic = heading_style.font.italic
                        if heading_style.font.color != "#000000":
                            run.font.color.rgb = RGBColor.from_string(heading_style.font.color.replace('#', ''))

                    if heading_style.paragraph:
                        para_style = heading_style.paragraph
                        if para_style.line_spacing > 0:
                            para.paragraph_format.line_spacing = para_style.line_spacing
                        if para_style.space_before > 0:
                            para.paragraph_format.space_before = Pt(para_style.space_before)
                        if para_style.space_after > 0:
                            para.paragraph_format.space_after = Pt(para_style.space_after)
                        if para_style.first_line_indent > 0:
                            para.paragraph_format.first_line_indent = Pt(para_style.first_line_indent * 12)

                        # 设置对齐方式
                        if para_style.alignment == "center":
                            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
                        elif para_style.alignment == "right":
                            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                        elif para_style.alignment == "justify":
                            para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                        else:
                            para.alignment = WD_ALIGN_PARAGRAPH.LEFT
            else:
                # 默认样式
                run.font.size = Pt(18 - section['level'] * 2 if section['level'] <= 6 else 10)
                run.font.bold = True

            self._apply_inline_formatting(para, heading_text)

            # 如果有标题，根据配置决定在标题前还是后插入图片
            if image_count > 0 and image_index < image_count:
                # 检查是否需要在标题前插入图片
                if hasattr(config, 'image_insert_position') and config.image_insert_position == "before_title":
                    image_index = self._insert_section_image(doc, image_files, image_index, image_count, output_path)
        elif section['content'] != '前置内容':
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_paragraph()
            run = para.add_run(heading_text)

            # 应用样式设置
            if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.heading_styles:
                if section['level'] in self.current_document_style.heading_styles:
                    heading_style = self.current_document_style.heading_styles[section['level']]
                    if heading_style.font:
                        run.font.name = heading_style.font.name
                        run.font.size = Pt(heading_style.font.size)
                        run.font.bold = heading_style.font.bold
                        run.font.italic = heading_style.font.italic
                        if heading_style.font.color != "#000000":
                            run.font.color.rgb = RGBColor.from_string(heading_style.font.color.replace('#', ''))
            else:
                run.font.size = Pt(14)
                run.font.bold = True

            para.paragraph_format.space_after = Pt(12)

        # 处理章节中的元素
        elements = section.get('elements', [])
        if not elements:
            return image_index

        # 处理元素
        for element in elements:
            # 添加元素到文档
            self._add_element_to_doc(doc, element)

            # 根据文章结构决定图片插入策略
            if element['type'] not in ['empty']:
                # 如果有标题，根据配置决定在标题后插入图片
                if section['level'] > 0 and section['level'] <= config.title_levels:
                    # 有标题的文章，在标题后的第一个内容后插入图片
                    if hasattr(config, 'image_insert_position') and config.image_insert_position == "after_title":
                        image_index = self._insert_section_image(doc, image_files, image_index, image_count, output_path)
                        # 插入一次后就不再插入，直到下一个标题
                        break
                else:
                    # 无标题的文章，根据段落计数控制图片插入间隔
                    self.paragraph_count += 1
                    if image_count > 0 and image_index < image_count:
                        # 检查是否需要插入图片（根据配置的间隔）
                        image_insert_interval = getattr(config, 'image_insert_interval', 5)  # 默认每5段插入一张图片
                        if self.paragraph_count % image_insert_interval == 0:
                            image_index = self._insert_section_image(doc, image_files, image_index, image_count, output_path)

        return image_index

    def _add_element_to_doc(self, doc: DocxDocument, element: Dict[str, Any]) -> None:
        """
        Append one parsed element to the document, dispatching on its type.

        Handled types: ``paragraph``, ``unordered_list``, ``ordered_list``,
        ``blockquote``, ``code_block``, ``table``, ``horizontal_rule`` and
        ``empty``. Any other type is ignored.

        Args:
            doc: DOCX document object.
            element: Element data; ``'type'`` is required, ``'content'``,
                ``'language'`` and ``'rows'`` are read when relevant.
        """
        kind = element['type']
        text = text_processor.process_text_content(element.get('content', ''))

        if kind == 'paragraph':
            self._add_formatted_paragraph(doc, text)
            return

        if kind in ('unordered_list', 'ordered_list'):
            is_ordered = kind == 'ordered_list'
            item = doc.add_paragraph(style='List Number' if is_ordered else 'List Bullet')
            self._apply_inline_formatting(item, text)
            self._apply_list_style(item, 'ordered' if is_ordered else 'unordered')
            return

        if kind == 'blockquote':
            quote = doc.add_paragraph(style='Quote')
            self._apply_inline_formatting(quote, text)
            self._apply_quote_style(quote)
            return

        if kind == 'code_block':
            # Code is passed on raw: the unprocessed content, not `text`.
            self._add_code_block(doc, element.get('content', ''), element.get('language', ''))
            return

        if kind == 'table':
            self._add_table_to_doc(doc, element.get('rows', []))
            return

        if kind == 'horizontal_rule':
            self._add_horizontal_rule(doc)
            return

        if kind == 'empty':
            doc.add_paragraph()

    def _apply_list_style(self, paragraph, list_type: str) -> None:
        """
        应用列表样式到段落

        Args:
            paragraph: DOCX段落对象
            list_type: 列表类型 ('unordered' 或 'ordered')
        """
        if not (hasattr(self, 'current_document_style') and self.current_document_style):
            return

        list_style = None
        if list_type == 'unordered' and self.current_document_style.unordered_list:
            list_style = self.current_document_style.unordered_list
        elif list_type == 'ordered' and self.current_document_style.ordered_list:
            list_style = self.current_document_style.ordered_list

        if list_style and list_style.paragraph:
            if list_style.paragraph.space_before > 0:
                paragraph.paragraph_format.space_before = Pt(list_style.paragraph.space_before)
            if list_style.paragraph.space_after > 0:
                paragraph.paragraph_format.space_after = Pt(list_style.paragraph.space_after)

    def _apply_quote_style(self, paragraph) -> None:
        """
        应用引用块样式到段落

        Args:
            paragraph: DOCX段落对象
        """
        if not (hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.quote_block):
            return

        quote_style = self.current_document_style.quote_block
        if quote_style.paragraph:
            if quote_style.paragraph.line_spacing > 0:
                paragraph.paragraph_format.line_spacing = quote_style.paragraph.line_spacing
            if quote_style.paragraph.space_before > 0:
                paragraph.paragraph_format.space_before = Pt(quote_style.paragraph.space_before)
            if quote_style.paragraph.space_after > 0:
                paragraph.paragraph_format.space_after = Pt(quote_style.paragraph.space_after)
            if quote_style.paragraph.first_line_indent > 0:
                paragraph.paragraph_format.first_line_indent = Pt(quote_style.paragraph.first_line_indent * 12)

            # 设置对齐方式
            if quote_style.paragraph.alignment == "center":
                paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif quote_style.paragraph.alignment == "right":
                paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            elif quote_style.paragraph.alignment == "justify":
                paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    def _add_formatted_paragraph(self, doc: DocxDocument, content: str) -> None:
        """
        添加带格式的段落

        Args:
            doc: DOCX文档对象
            content: 段落内容
        """
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        self._apply_inline_formatting(para, content)

        # 应用样式中的段落格式
        if hasattr(self, 'current_document_style') and self.current_document_style:
            if self.current_document_style.body_paragraph:
                body_para = self.current_document_style.body_paragraph
                if body_para.line_spacing > 0:
                    para.paragraph_format.line_spacing = body_para.line_spacing
                if body_para.space_before > 0:
                    para.paragraph_format.space_before = Pt(body_para.space_before)
                if body_para.space_after > 0:
                    para.paragraph_format.space_after = Pt(body_para.space_after)
                if body_para.first_line_indent > 0:
                    para.paragraph_format.first_line_indent = Pt(body_para.first_line_indent * 12)  # 字符转磅

                # 设置对齐方式
                if body_para.alignment == "center":
                    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
                elif body_para.alignment == "right":
                    para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                elif body_para.alignment == "justify":
                    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                else:
                    para.alignment = WD_ALIGN_PARAGRAPH.LEFT
        elif config.line_spacing > 0:
            para.paragraph_format.line_spacing = config.line_spacing

    def _apply_inline_formatting(self, paragraph, text: str) -> None:
        """
        Add ``text`` to a paragraph as runs, honouring inline Markdown spans.

        Spans come from ``MarkdownParser.extract_inline_formatting`` and are
        read as dicts with ``'type'``, ``'start'`` and ``'end'`` (plus
        ``'text'`` for links). Text outside spans is added with the body font.
        Supported span types are ``bold``, ``italic``, ``code``,
        ``strikethrough`` and ``link``.

        Spans are processed in order of their start offset. A span that starts
        inside an already emitted span is skipped so its characters are not
        written twice, and a span of an unknown type is written as plain text
        rather than dropped.

        Args:
            paragraph: DOCX paragraph object.
            text: Text to format (already run through the text processor).
        """
        formatting = MarkdownParser.extract_inline_formatting(text)

        # No spans at all: a single plain run.
        if not formatting:
            run = paragraph.add_run(text)
            self._apply_body_font_style(run)
            return

        current_pos = 0

        # sorted() is stable, so an already ordered span list is left as is.
        for fmt in sorted(formatting, key=lambda item: item['start']):
            start, end = fmt['start'], fmt['end']

            if start < current_pos:
                # Nested/overlapping span: its text has been emitted already.
                continue

            # Plain text between the previous span and this one.
            if start > current_pos:
                run = paragraph.add_run(text[current_pos:start])
                self._apply_body_font_style(run)

            segment = text[start:end]
            kind = fmt['type']

            if kind == 'bold':
                clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', segment)
                run = paragraph.add_run(clean_text)
                self._apply_body_font_style(run)
                run.bold = True

            elif kind == 'italic':
                clean_text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|_(.+?)_', r'\1\2', segment)
                run = paragraph.add_run(clean_text)
                self._apply_body_font_style(run)
                run.italic = True

            elif kind == 'code':
                clean_text = re.sub(r'`([^`]+)`', r'\1', segment)
                run = paragraph.add_run(clean_text)
                # Inline code prefers the code-block font of the active style.
                doc_style = getattr(self, 'current_document_style', None)
                if doc_style and doc_style.code_block:
                    code_style = doc_style.code_block
                    if code_style.font:
                        run.font.name = code_style.font.name
                        run.font.size = Pt(code_style.font.size)
                        if code_style.font.color != "#000000":
                            run.font.color.rgb = RGBColor.from_string(code_style.font.color.replace('#', ''))
                else:
                    run.font.name = 'Courier New'
                    run.font.size = Pt(10)

            elif kind == 'strikethrough':
                clean_text = re.sub(r'~~(.+?)~~', r'\1', segment)
                run = paragraph.add_run(clean_text)
                self._apply_body_font_style(run)
                run.font.strike = True

            elif kind == 'link':
                # Only the link text is shown, styled blue and underlined.
                run = paragraph.add_run(fmt['text'])
                self._apply_body_font_style(run)
                run.font.color.rgb = RGBColor(0, 0, 255)
                run.underline = True

            else:
                # Unknown span type: keep the raw text so no content is lost.
                run = paragraph.add_run(segment)
                self._apply_body_font_style(run)

            current_pos = end

        # Trailing plain text after the last span.
        if current_pos < len(text):
            run = paragraph.add_run(text[current_pos:])
            self._apply_body_font_style(run)

    def _apply_body_font_style(self, run) -> None:
        """
        应用正文字体样式到run

        Args:
            run: DOCX run对象
        """
        if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.body_font:
            body_font = self.current_document_style.body_font
            run.font.name = body_font.name
            run.font.size = Pt(body_font.size)
            run.font.bold = body_font.bold
            run.font.italic = body_font.italic
            if body_font.color != "#000000":
                run.font.color.rgb = RGBColor.from_string(body_font.color.replace('#', ''))

    def _add_code_block(self, doc: DocxDocument, content: str, language: str) -> None:
        """
        添加代码块

        Args:
            doc: DOCX文档对象
            content: 代码内容
            language: 编程语言
        """
        para = doc.add_paragraph(style='No Spacing')
        run = para.add_run(content)

        # 应用代码块样式
        if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.code_block:
            code_style = self.current_document_style.code_block
            if code_style.font:
                run.font.name = code_style.font.name
                run.font.size = Pt(code_style.font.size)
                run.font.bold = code_style.font.bold
                run.font.italic = code_style.font.italic
                if code_style.font.color != "#000000":
                    run.font.color.rgb = RGBColor.from_string(code_style.font.color.replace('#', ''))

            if code_style.paragraph:
                para_style = code_style.paragraph
                if para_style.space_before > 0:
                    para.paragraph_format.space_before = Pt(para_style.space_before)
                if para_style.space_after > 0:
                    para.paragraph_format.space_after = Pt(para_style.space_after)
        else:
            # 默认样式
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
            para.paragraph_format.space_before = Pt(6)
            para.paragraph_format.space_after = Pt(6)

    def _add_table_to_doc(self, doc: DocxDocument, rows: List[List[str]]) -> None:
        """
        添加表格到文档

        Args:
            doc: DOCX文档对象
            rows: 表格行数据
        """
        if not rows:
            return

        table = doc.add_table(rows=len(rows), cols=len(rows[0]))
        table.style = 'Table Grid'

        for i, row_data in enumerate(rows):
            row_cells = table.rows[i].cells
            for j, cell_data in enumerate(row_data):
                if j < len(row_cells):
                    processed_text = text_processor.process_text_content(cell_data)
                    cell_para = row_cells[j].paragraphs[0]
                    cell_para.clear()
                    run = cell_para.add_run(processed_text)

                    # 应用表格样式
                    if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.table_style:
                        table_style = self.current_document_style.table_style
                        if table_style.font:
                            run.font.name = table_style.font.name
                            run.font.size = Pt(table_style.font.size)
                            run.font.bold = table_style.font.bold
                            run.font.italic = table_style.font.italic
                            if table_style.font.color != "#000000":
                                run.font.color.rgb = RGBColor.from_string(table_style.font.color.replace('#', ''))

                        if table_style.paragraph:
                            para_style = table_style.paragraph
                            if para_style.space_before > 0:
                                cell_para.paragraph_format.space_before = Pt(para_style.space_before)
                            if para_style.space_after > 0:
                                cell_para.paragraph_format.space_after = Pt(para_style.space_after)

    def _add_horizontal_rule(self, doc: DocxDocument) -> None:
        """
        Append a horizontal rule, drawn as a centered run of underlined spaces.

        Args:
            doc: DOCX document object.
        """
        rule_para = doc.add_paragraph()
        rule_run = rule_para.add_run()
        rule_run.font.underline = True
        # 100 underlined spaces give a line long enough to act as a rule.
        rule_run.text = " " * 100
        rule_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    def _insert_section_image(self, doc: DocxDocument, image_files: List[str],
                             image_index: int, image_count: int, output_path: str) -> int:
        """
        为章节插入图片

        Args:
            doc: DOCX文档对象
            image_files: 图片文件列表
            image_index: 当前图片索引
            image_count: 图片总数
            output_path: 输出文件路径

        Returns:
            int: 更新后的图片索引
        """
        if image_count > 0 and image_index < image_count:
            try:
                self._insert_image(doc, image_files[image_index], output_path)
                image_index += 1

                # 根据策略处理图片不足的情况
                if image_index >= image_count:
                    if config.image_strategy == "cycle":
                        image_index = 0
                    elif config.image_strategy == "truncate":
                        image_index = image_count
                    # repeat_last策略：保持当前索引-1，下次还用最后一张

            except Exception as e:
                # 插入失败时添加错误提示
                para = doc.add_paragraph()
                run = para.add_run(f"[图片插入失败: {str(e)}]")
                run.font.color.rgb = RGBColor(255, 0, 0)  # 红色

        return image_index

    def _insert_image(self, doc: DocxDocument, image_path: str, output_path: str) -> None:
        """
        Append one image to the document in its own paragraph.

        The image is first passed through
        ``ImageProcessor.optimize_image_for_docx`` and
        ``ImageProcessor.process_image``. Intermediate files are written to a
        ``temp_images`` directory next to the output file and are registered
        in ``self.temp_files`` for later cleanup.

        Args:
            doc: DOCX document object.
            image_path: Path of the image file.
            output_path: Output file path (used to locate the temp directory).

        Raises:
            Exception: When any step fails; the original error is chained as
                the cause.
        """
        try:
            temp_dir = os.path.join(os.path.dirname(output_path), "temp_images")
            os.makedirs(temp_dir, exist_ok=True)
            optimized_image_path = ImageProcessor.optimize_image_for_docx(image_path, temp_dir)

            # A distinct file placed inside our temp directory is treated as an
            # intermediate copy and registered for cleanup; the source image and
            # anything outside the temp directory are never registered.
            if (optimized_image_path
                    and optimized_image_path != image_path
                    and os.path.dirname(os.path.abspath(optimized_image_path)) == os.path.abspath(temp_dir)
                    and optimized_image_path not in self.temp_files):
                self.temp_files.append(optimized_image_path)

            # Orientation fix and size calculation.
            img, width = ImageProcessor.process_image(optimized_image_path)

            if config.image_resize == "width":
                # The processed image has to be written out before insertion.
                img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
                img.save(img_path)
                self.temp_files.append(img_path)
            else:
                img_path = optimized_image_path

            # A fresh paragraph has no runs yet, so one is added for the picture.
            para = doc.add_paragraph()
            para.add_run().add_picture(img_path, width=Inches(width))
            para.alignment = ImageProcessor.get_image_alignment()

        except Exception as e:
            raise Exception(f"插入图片失败: {str(e)}") from e

    def _add_disclaimer(self, doc: DocxDocument) -> None:
        """
        Append the disclaimer: a literal "---" paragraph followed by the
        processed ``DISCLAIMER_TEXT`` at 10 pt with single line spacing.

        Args:
            doc: DOCX document object.
        """
        doc.add_paragraph("---")

        notice_para = doc.add_paragraph()
        notice_run = notice_para.add_run(text_processor.process_text_content(DISCLAIMER_TEXT))
        notice_run.font.size = Pt(10)
        notice_para.paragraph_format.line_spacing = 1.0

    def _cleanup_temp_files(self) -> None:
        """清理临时文件"""
        for temp_file in self.temp_files:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except Exception as e:
                print(f"清理临时文件失败 {temp_file}: {e}")
        self.temp_files.clear()


# Module-level DocxGenerator instance, created at import time; the
# module-level generate() wrapper below delegates to it.
docx_generator = DocxGenerator()


# Function kept for compatibility with the old interface.
def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str, progress_callback: Optional[Callable] = None) -> bool:
    """
    Generate a DOCX document (legacy entry point).

    Forwards all arguments unchanged to ``docx_generator.generate`` and
    returns its result.
    """
    result = docx_generator.generate(
        sections,
        image_files,
        output_path,
        progress_callback,
    )
    return result