# TxT2Docx/docx_generator.py

"""
DOCX文档生成模块

负责将解析后的Markdown结构转换为DOCX文档，包括文本格式化、图片插入和样式设置。
"""

import os
import re
from typing import List, Dict, Any, Callable, Optional
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE

from config import config
from text_processor import text_processor
from image_processor import ImageProcessor
from markdown_parser import MarkdownParser


# Disclaimer text. DocxGenerator._add_disclaimer() appends it to the end of
# the document, which generate() does only when config.add_disclaimer is set.
# Note that the literal itself is wrapped in backticks.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络，文章旨在传播正能量，均无低俗等不良引导，请观众勿对号入座，并上升到人身攻击等方面。观众理性看待本事件，切勿留下主观臆断的恶意评论，互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题，请及时联系作者，我们将予以删除。`"""


class DocxGenerator:
    """Build a DOCX file from parsed Markdown sections.

    Each section contributes an optional heading, its parsed elements
    (paragraphs, lists, quotes, code blocks, tables, rules) and at most one
    image taken in order from the supplied image list.  Temporary image
    files written during generation are tracked in ``temp_files`` and
    removed when ``generate`` finishes.
    """

    def __init__(self):
        """Create a generator with an empty temporary-file registry."""
        # Paths of temporary images created by _insert_image(); they are
        # deleted by _cleanup_temp_files().
        self.temp_files = []

    def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
                 output_path: str, progress_callback: Optional[Callable] = None) -> bool:
        """
        Generate the DOCX document and save it to ``output_path``.

        Args:
            sections: Parsed sections; each is a dict read via the keys
                ``'content'``, ``'level'`` and (optionally) ``'elements'``.
            image_files: Paths of the images to distribute over the sections.
            output_path: Destination path of the DOCX file.
            progress_callback: Optional callable invoked as
                ``progress_callback(percent, message)``.

        Returns:
            bool: ``True`` when the document was written.

        Raises:
            Exception: Any failure is re-raised as a generic ``Exception``
                with the original error chained as its cause.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            total_sections = len(sections)
            image_index = 0
            image_count = len(image_files)

            for i, section in enumerate(sections):
                if progress_callback:
                    progress = int((i / total_sections) * 100)
                    # Keep the progress message short: at most 30 characters
                    # of the section title, followed by an ellipsis.
                    section_title = section['content']
                    if len(section_title) > 30:
                        section_title = section_title[:30] + "..."
                    progress_callback(progress, f"处理章节: {section_title}")

                image_index = self._add_section_to_doc(
                    doc, section, image_files, image_index, image_count, output_path
                )

            if config.add_disclaimer:
                self._add_disclaimer(doc)

            doc.save(output_path)

            if progress_callback:
                progress_callback(100, "转换完成!")

            return True

        except Exception as e:
            # Chain the original error so the real traceback is not lost.
            raise Exception(f"生成DOCX失败: {str(e)}") from e
        finally:
            # Temporary images are removed whether or not generation succeeded.
            self._cleanup_temp_files()

    def _setup_document_styles(self, doc: Document) -> None:
        """
        Apply the configured line spacing to the document's 'Normal' style.

        Errors are reported with ``print`` and otherwise ignored so that a
        styling problem does not abort document generation.

        Args:
            doc: The DOCX document object.
        """
        try:
            styles = doc.styles

            if 'Normal' in styles:
                normal_style = styles['Normal']
                if config.line_spacing > 0:
                    normal_style.paragraph_format.line_spacing = config.line_spacing

        except Exception as e:
            print(f"设置文档样式时出错: {e}")

    def _add_section_to_doc(self, doc: Document, section: Dict[str, Any],
                           image_files: List[str], image_index: int, image_count: int,
                           output_path: str) -> int:
        """
        Add one section (heading, elements and its image) to the document.

        Args:
            doc: The DOCX document object.
            section: Section data (keys ``'level'``, ``'content'``, ``'elements'``).
            image_files: Image file paths.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path (its directory hosts temporary images).

        Returns:
            int: The updated image index.
        """
        level = section['level']
        if 0 < level <= config.title_levels:
            # Levels within the configured range become real DOCX headings.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_heading(level=level)
            self._apply_inline_formatting(para, heading_text)
        elif section['content'] != '前置内容':
            # Other titles are rendered as a bold 14pt paragraph; the
            # placeholder title '前置内容' is not rendered at all.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_paragraph()
            run = para.add_run(heading_text)
            run.font.size = Pt(14)
            run.font.bold = True
            # Spacing lives on paragraph_format; assigning `para.space_after`
            # would only create an unused attribute on the Paragraph object.
            para.paragraph_format.space_after = Pt(12)

        elements = section.get('elements', [])
        if not elements:
            return image_index

        # The section image goes right after the first non-empty element.
        image_pending = True

        for element in elements:
            self._add_element_to_doc(doc, element)

            if image_pending and element['type'] != 'empty':
                image_pending = False
                image_index = self._insert_section_image(
                    doc, image_files, image_index, image_count, output_path
                )

        return image_index

    def _add_element_to_doc(self, doc: Document, element: Dict[str, Any]) -> None:
        """
        Add a single parsed element to the document.

        Element types other than the ones handled below are ignored.

        Args:
            doc: The DOCX document object.
            element: Element data; ``'type'`` selects the handler, and
                ``'content'``, ``'language'`` and ``'rows'`` are read
                depending on the type.
        """
        element_type = element['type']
        content = text_processor.process_text_content(element.get('content', ''))

        if element_type == 'paragraph':
            self._add_formatted_paragraph(doc, content)

        elif element_type == 'unordered_list':
            para = doc.add_paragraph(style='List Bullet')
            self._apply_inline_formatting(para, content)

        elif element_type == 'ordered_list':
            para = doc.add_paragraph(style='List Number')
            self._apply_inline_formatting(para, content)

        elif element_type == 'blockquote':
            para = doc.add_paragraph(style='Quote')
            self._apply_inline_formatting(para, content)

        elif element_type == 'code_block':
            # Code is inserted verbatim: the raw content, not the processed text.
            self._add_code_block(doc, element.get('content', ''), element.get('language', ''))

        elif element_type == 'table':
            self._add_table_to_doc(doc, element.get('rows', []))

        elif element_type == 'horizontal_rule':
            self._add_horizontal_rule(doc)

        elif element_type == 'empty':
            doc.add_paragraph()

    def _add_formatted_paragraph(self, doc: Document, content: str) -> None:
        """
        Add a paragraph with inline formatting applied.

        Empty or whitespace-only content yields an empty paragraph.

        Args:
            doc: The DOCX document object.
            content: Paragraph text (already passed through the text processor).
        """
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        self._apply_inline_formatting(para, content)

        if config.line_spacing > 0:
            para.paragraph_format.line_spacing = config.line_spacing

    def _apply_inline_formatting(self, paragraph, text: str) -> None:
        """
        Append ``text`` to ``paragraph`` as runs carrying inline formatting.

        Spans come from ``MarkdownParser.extract_inline_formatting``; each is
        read via ``'start'``, ``'end'``, ``'type'`` and, for links, ``'text'``.

        Args:
            paragraph: The DOCX paragraph object to append runs to.
            text: Text to format (already passed through the text processor).
        """
        formatting = MarkdownParser.extract_inline_formatting(text)

        # No inline markup: a single plain run is enough.
        if not formatting:
            paragraph.add_run(text)
            return

        current_pos = 0

        # Defensive: the parser's ordering/overlap guarantees are not visible
        # here, so process spans left to right and skip any span that starts
        # inside text already emitted (otherwise text would be duplicated and
        # current_pos could move backwards).
        for fmt in sorted(formatting, key=lambda item: item['start']):
            start, end = fmt['start'], fmt['end']
            if start < current_pos:
                continue

            # Plain text between the previous span and this one.
            if start > current_pos:
                paragraph.add_run(text[current_pos:start])

            span = text[start:end]
            fmt_type = fmt['type']

            if fmt_type == 'bold':
                clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', span)
                run = paragraph.add_run(clean_text)
                run.bold = True

            elif fmt_type == 'italic':
                clean_text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|_(.+?)_', r'\1\2', span)
                run = paragraph.add_run(clean_text)
                run.italic = True

            elif fmt_type == 'code':
                clean_text = re.sub(r'`([^`]+)`', r'\1', span)
                run = paragraph.add_run(clean_text)
                run.font.name = 'Courier New'
                run.font.size = Pt(10)

            elif fmt_type == 'strikethrough':
                clean_text = re.sub(r'~~(.+?)~~', r'\1', span)
                run = paragraph.add_run(clean_text)
                run.font.strike = True

            elif fmt_type == 'link':
                # Only the link text is shown, styled blue and underlined.
                run = paragraph.add_run(fmt['text'])
                run.font.color.rgb = RGBColor(0, 0, 255)
                run.underline = True

            else:
                # Unknown span type: keep the raw text rather than dropping it.
                paragraph.add_run(span)

            current_pos = end

        # Trailing plain text after the last span.
        if current_pos < len(text):
            paragraph.add_run(text[current_pos:])

    def _add_code_block(self, doc: Document, content: str, language: str) -> None:
        """
        Add a code block as a monospaced 'No Spacing' paragraph.

        Args:
            doc: The DOCX document object.
            content: Code text, inserted verbatim.
            language: Programming language of the block (currently unused).
        """
        para = doc.add_paragraph(style='No Spacing')
        run = para.add_run(content)
        run.font.name = 'Courier New'
        run.font.size = Pt(10)

        # Best effort: separate the block from surrounding text.  A failure
        # here is reported but must not abort document generation.
        try:
            para.paragraph_format.space_before = Pt(6)
            para.paragraph_format.space_after = Pt(6)
        except Exception as e:
            print(f"设置代码块间距时出错: {e}")

    def _add_table_to_doc(self, doc: Document, rows: List[List[str]]) -> None:
        """
        Add a table to the document.

        The first row determines the number of columns; extra cells in later
        rows are ignored and missing cells are left empty.  Nothing is added
        when there are no rows or the first row has no cells.

        Args:
            doc: The DOCX document object.
            rows: Table data, one list of cell strings per row.
        """
        if not rows:
            return

        column_count = len(rows[0])
        if column_count == 0:
            # A table without columns has nothing to render.
            return

        table = doc.add_table(rows=len(rows), cols=column_count)
        table.style = 'Table Grid'

        for table_row, row_data in zip(table.rows, rows):
            # zip() stops at the shorter side, which drops surplus cell data.
            for cell, cell_data in zip(table_row.cells, row_data):
                cell.text = text_processor.process_text_content(cell_data)

    def _add_horizontal_rule(self, doc: Document) -> None:
        """
        Add a horizontal rule to the document.

        The rule is emulated with a centered run of 100 underlined spaces.

        Args:
            doc: The DOCX document object.
        """
        para = doc.add_paragraph()
        run = para.add_run()
        run.font.underline = True
        run.text = " " * 100
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    def _insert_section_image(self, doc: Document, image_files: List[str],
                             image_index: int, image_count: int, output_path: str) -> int:
        """
        Insert the next image for a section and advance the image index.

        When the last image has just been used, ``config.image_strategy``
        decides what happens next:

        * ``"cycle"``: start again from the first image;
        * ``"repeat_last"``: keep re-using the last image;
        * ``"truncate"`` (or any other value): insert no further images.

        If the insertion fails, a red error note is added to the document
        and the index is left unchanged.

        Args:
            doc: The DOCX document object.
            image_files: Image file paths.
            image_index: Index of the image to insert.
            image_count: Total number of images.
            output_path: Output file path.

        Returns:
            int: The updated image index.
        """
        # Nothing to insert: no images at all, or the list is exhausted.
        if image_count <= 0 or image_index >= image_count:
            return image_index

        try:
            self._insert_image(doc, image_files[image_index], output_path)
            image_index += 1

            if image_index >= image_count:
                strategy = config.image_strategy
                if strategy == "cycle":
                    image_index = 0
                elif strategy == "repeat_last":
                    # Step back so the next section gets the last image again.
                    image_index = image_count - 1
                # "truncate": the index stays at image_count, so the guard
                # above stops any further insertion.

        except Exception as e:
            para = doc.add_paragraph()
            run = para.add_run(f"[图片插入失败: {str(e)}]")
            run.font.color.rgb = RGBColor(255, 0, 0)

        return image_index

    def _insert_image(self, doc: Document, image_path: str, output_path: str) -> None:
        """
        Insert one image into the document in its own paragraph.

        When ``config.image_resize == "width"`` the processed image is first
        saved as a temporary PNG next to the output file (and registered for
        cleanup); otherwise the original file is inserted directly.

        Args:
            doc: The DOCX document object.
            image_path: Path of the image file.
            output_path: Output file path; its directory hosts the temporary image.

        Raises:
            Exception: Any failure, with the original error chained as cause.
        """
        try:
            img, width = ImageProcessor.process_image(image_path)

            if config.image_resize == "width":
                # abspath() makes a bare file name resolve to the current
                # directory; dirname() alone would yield '' and makedirs('')
                # raises FileNotFoundError.
                temp_dir = os.path.dirname(os.path.abspath(output_path))
                os.makedirs(temp_dir, exist_ok=True)
                img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
                # Register before saving so a partially written file is
                # still cleaned up if save() fails.
                self.temp_files.append(img_path)
                img.save(img_path)
            else:
                img_path = image_path

            para = doc.add_paragraph()
            para.add_run().add_picture(img_path, width=Inches(width))
            para.alignment = ImageProcessor.get_image_alignment()

        except Exception as e:
            raise Exception(f"插入图片失败: {str(e)}") from e

    def _add_disclaimer(self, doc: Document) -> None:
        """
        Append the disclaimer to the document.

        A ``---`` paragraph is added first, then the processed
        ``DISCLAIMER_TEXT`` as a 10pt, single-spaced paragraph.

        Args:
            doc: The DOCX document object.
        """
        doc.add_paragraph("---")
        para = doc.add_paragraph()
        disclaimer_text = text_processor.process_text_content(DISCLAIMER_TEXT)
        run = para.add_run(disclaimer_text)
        run.font.size = Pt(10)
        para.paragraph_format.line_spacing = 1.0

    def _cleanup_temp_files(self) -> None:
        """Delete all registered temporary files and clear the registry.

        Failures are reported with ``print`` and do not stop the cleanup of
        the remaining files.
        """
        for temp_file in self.temp_files:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except Exception as e:
                print(f"清理临时文件失败 {temp_file}: {e}")
        self.temp_files.clear()


# Module-level shared DocxGenerator instance; the generate() wrapper below
# delegates to it.
docx_generator = DocxGenerator()


# Function kept for compatibility with the old module-level interface.
def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str, progress_callback: Optional[Callable] = None) -> bool:
    """Generate a DOCX document (legacy interface).

    Forwards all arguments unchanged to the shared ``docx_generator``
    instance and returns its result.
    """
    succeeded = docx_generator.generate(
        sections,
        image_files,
        output_path,
        progress_callback,
    )
    return succeeded