"""DOCX document generation module.

Converts the parsed Markdown structure into a DOCX document, including text
formatting, image insertion and style settings.
"""

import os
import re
from typing import List, Dict, Any, Callable, Optional

from docx import Document
from docx.document import Document as DocxDocument
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE

from config import config
from text_processor import text_processor
from image_processor import ImageProcessor
from markdown_parser import MarkdownParser
from style_manager import style_manager

# Disclaimer text appended to the document when config.add_disclaimer is set.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`"""

# Alignment names used by the style objects, mapped to python-docx constants.
# Any name not listed here is handled by the caller (left-aligned or untouched).
_ALIGNMENT_BY_NAME = {
    "center": WD_ALIGN_PARAGRAPH.CENTER,
    "right": WD_ALIGN_PARAGRAPH.RIGHT,
    "justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
}

# Strips the ** / __ markers from a bold span.
_BOLD_MARKERS = re.compile(r'\*\*(.+?)\*\*|__(.+?)__')

# NOTE(review): the original italic pattern was lost when this file was
# extracted (only its first characters survived). This is a reconstruction
# that strips single * / _ markers -- confirm against the original source.
_ITALIC_MARKERS = re.compile(
    r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|(?<!_)_(?!_)(.+?)(?<!_)_(?!_)'
)


class DocxGenerator:
    """Builds a DOCX document from parsed Markdown sections."""

    def __init__(self):
        """Initialise an empty generator."""
        self.temp_files = []  # temporary image files, removed after generate()
        self.current_document_style = None  # style object currently applied

    def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
                 output_path: str, progress_callback: Optional[Callable] = None) -> bool:
        """Generate the DOCX document and save it to ``output_path``.

        Args:
            sections: Parsed document sections. Each section is read through
                its 'content', 'level' and optional 'elements' keys.
            image_files: Image file paths, consumed one per section.
            output_path: Destination path of the DOCX file.
            progress_callback: Optional callable invoked as
                ``callback(percent, message)``.

        Returns:
            bool: True when the document was written.

        Raises:
            Exception: Wraps any failure; the original error is chained as
                ``__cause__``.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            total_sections = len(sections)
            image_index = 0
            image_count = len(image_files)

            for i, section in enumerate(sections):
                if progress_callback:
                    progress = int((i / total_sections) * 100)
                    title = section['content']
                    section_title = title[:30] + "..." if len(title) > 30 else title
                    progress_callback(progress, f"处理章节: {section_title}")

                image_index = self._add_section_to_doc(
                    doc, section, image_files, image_index, image_count, output_path)

            if config.add_disclaimer:
                self._add_disclaimer(doc)

            doc.save(output_path)

            if progress_callback:
                progress_callback(100, "转换完成!")
            return True
        except Exception as e:
            # Same exception type and message as before; chaining keeps the
            # root cause visible in the traceback.
            raise Exception(f"生成DOCX失败: {str(e)}") from e
        finally:
            self._cleanup_temp_files()

    def _setup_document_styles(self, doc) -> None:
        """Load the style named by ``config.current_style``.

        Errors are reported on stdout and never propagated; in that case the
        built-in default formatting is used.

        Args:
            doc: DOCX document object (not modified here).
        """
        # Reset first: the module-level generator instance is reused between
        # calls, and a failed lookup must not keep the previous call's style.
        self.current_document_style = None
        try:
            current_style = style_manager.get_style(config.current_style)
            if not current_style:
                print(f"警告: 找不到样式 '{config.current_style}',使用默认样式")
                return
            self.current_document_style = current_style
            print(f"应用文档样式: {current_style.name}")
        except Exception as e:
            print(f"设置文档样式时出错: {e}")

    # ------------------------------------------------------------------
    # Style helpers
    # ------------------------------------------------------------------

    def _active_style_part(self, attr_name: str):
        """Return attribute ``attr_name`` of the active style, or None if unset/falsy."""
        style = getattr(self, 'current_document_style', None)
        if not style:
            return None
        return getattr(style, attr_name) or None

    def _heading_style_for(self, level: int):
        """Return the active style's heading entry for ``level``, or None."""
        heading_styles = self._active_style_part('heading_styles')
        if heading_styles and level in heading_styles:
            return heading_styles[level] or None
        return None

    @staticmethod
    def _apply_font(run, font_style) -> None:
        """Copy name, size, bold, italic and (non-black) colour onto ``run``."""
        run.font.name = font_style.name
        run.font.size = Pt(font_style.size)
        run.font.bold = font_style.bold
        run.font.italic = font_style.italic
        # "#000000" is skipped so the run keeps the document's default colour.
        if font_style.color != "#000000":
            run.font.color.rgb = RGBColor.from_string(font_style.color.replace('#', ''))

    @staticmethod
    def _apply_spacing(para, para_style) -> None:
        """Apply space-before / space-after (in points) when they are positive."""
        if para_style.space_before > 0:
            para.paragraph_format.space_before = Pt(para_style.space_before)
        if para_style.space_after > 0:
            para.paragraph_format.space_after = Pt(para_style.space_after)

    @classmethod
    def _apply_paragraph_layout(cls, para, para_style, default_left: bool) -> None:
        """Apply line spacing, spacing, first-line indent and alignment.

        Args:
            para: DOCX paragraph object.
            para_style: Paragraph style object from the active style.
            default_left: When True, an alignment other than center / right /
                justify is set to left; when False it is left untouched.
        """
        if para_style.line_spacing > 0:
            para.paragraph_format.line_spacing = para_style.line_spacing
        cls._apply_spacing(para, para_style)
        if para_style.first_line_indent > 0:
            # The indent is expressed in characters; converted at 12 pt each.
            para.paragraph_format.first_line_indent = Pt(para_style.first_line_indent * 12)

        alignment = _ALIGNMENT_BY_NAME.get(para_style.alignment)
        if alignment is not None:
            para.alignment = alignment
        elif default_left:
            para.alignment = WD_ALIGN_PARAGRAPH.LEFT

    # ------------------------------------------------------------------
    # Sections and elements
    # ------------------------------------------------------------------

    def _add_section_to_doc(self, doc: DocxDocument, section: Dict[str, Any],
                            image_files: List[str], image_index: int,
                            image_count: int, output_path: str) -> int:
        """Add one section (title, elements, section image) to the document.

        Args:
            doc: DOCX document object.
            section: Section data ('level', 'content', optional 'elements').
            image_files: Image file list.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path (its directory holds temp images).

        Returns:
            int: The updated image index.
        """
        level = section['level']

        if 0 < level <= config.title_levels:
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_heading(level=level)
            para.clear()  # drop default content before adding the styled run
            run = para.add_run(heading_text)
            # The original additionally called _apply_inline_formatting() on
            # this paragraph, which appended the heading text a second time;
            # the heading is now written exactly once.

            heading_style = self._heading_style_for(level)
            if heading_style is not None:
                if heading_style.font:
                    self._apply_font(run, heading_style.font)
                if heading_style.paragraph:
                    self._apply_paragraph_layout(
                        para, heading_style.paragraph, default_left=True)
            else:
                # Built-in default, used whenever no custom heading style
                # applies to this level.
                run.font.size = Pt(18 - level * 2 if level <= 6 else 10)
                run.font.bold = True
        elif section['content'] != '前置内容':
            # Title outside the configured heading levels: plain paragraph.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_paragraph()
            run = para.add_run(heading_text)

            heading_style = self._heading_style_for(level)
            if heading_style is not None:
                if heading_style.font:
                    self._apply_font(run, heading_style.font)
            else:
                run.font.size = Pt(14)
                run.font.bold = True
            # NOTE(review): the source indentation was lost; this spacing is
            # assumed to apply in both branches -- confirm.
            para.paragraph_format.space_after = Pt(12)

        elements = section.get('elements', [])
        if not elements:
            return image_index

        # The section image goes right after the first non-empty element.
        first_content_added = False
        for element in elements:
            self._add_element_to_doc(doc, element)
            if not first_content_added and element['type'] != 'empty':
                first_content_added = True
                image_index = self._insert_section_image(
                    doc, image_files, image_index, image_count, output_path)

        return image_index

    def _add_element_to_doc(self, doc: DocxDocument, element: Dict[str, Any]) -> None:
        """Add one parsed element to the document, dispatching on its 'type'.

        Element types not listed below are ignored.

        Args:
            doc: DOCX document object.
            element: Element data ('type' plus 'content' / 'rows' / 'language').
        """
        element_type = element['type']
        content = text_processor.process_text_content(element.get('content', ''))

        if element_type == 'paragraph':
            self._add_formatted_paragraph(doc, content)
        elif element_type == 'unordered_list':
            self._add_list_item(doc, content, 'List Bullet', 'unordered_list')
        elif element_type == 'ordered_list':
            self._add_list_item(doc, content, 'List Number', 'ordered_list')
        elif element_type == 'blockquote':
            para = doc.add_paragraph(style='Quote')
            self._apply_inline_formatting(para, content)
            quote_style = self._active_style_part('quote_block')
            if quote_style and quote_style.paragraph:
                # Unlike body text, an unrecognised alignment is left as is.
                self._apply_paragraph_layout(
                    para, quote_style.paragraph, default_left=False)
        elif element_type == 'code_block':
            # Code uses the raw content, not the text-processed one.
            self._add_code_block(doc, element.get('content', ''), element.get('language', ''))
        elif element_type == 'table':
            self._add_table_to_doc(doc, element.get('rows', []))
        elif element_type == 'horizontal_rule':
            self._add_horizontal_rule(doc)
        elif element_type == 'empty':
            doc.add_paragraph()

    def _add_list_item(self, doc: DocxDocument, content: str,
                       docx_style: str, style_attr: str) -> None:
        """Add a list paragraph and apply the list spacing of the active style."""
        para = doc.add_paragraph(style=docx_style)
        self._apply_inline_formatting(para, content)
        list_style = self._active_style_part(style_attr)
        if list_style and list_style.paragraph:
            self._apply_spacing(para, list_style.paragraph)

    def _add_formatted_paragraph(self, doc: DocxDocument, content: str) -> None:
        """Add a body paragraph with inline formatting.

        Args:
            doc: DOCX document object.
            content: Paragraph text; blank text yields an empty paragraph.
        """
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        self._apply_inline_formatting(para, content)

        body_para = self._active_style_part('body_paragraph')
        if body_para:
            self._apply_paragraph_layout(para, body_para, default_left=True)
        elif config.line_spacing > 0:
            # No body paragraph style available: use the global line spacing.
            para.paragraph_format.line_spacing = config.line_spacing

    def _apply_inline_formatting(self, paragraph, text: str) -> None:
        """Append ``text`` to ``paragraph`` as runs, honouring inline formatting.

        Args:
            paragraph: DOCX paragraph object.
            text: Text to format (already passed through the text processor).
        """
        formatting = MarkdownParser.extract_inline_formatting(text)

        # No formatting spans: a single plain run.
        if not formatting:
            self._apply_body_font_style(paragraph.add_run(text))
            return

        current_pos = 0
        for fmt in formatting:
            # Plain text between the previous span and this one.
            if fmt['start'] > current_pos:
                self._apply_body_font_style(
                    paragraph.add_run(text[current_pos:fmt['start']]))

            span = text[fmt['start']:fmt['end']]
            if fmt['type'] == 'bold':
                run = paragraph.add_run(_BOLD_MARKERS.sub(r'\1\2', span))
                self._apply_body_font_style(run)
                run.bold = True
            elif fmt['type'] == 'italic':
                # NOTE(review): from this branch to the end of the method the
                # original source was lost in extraction; the code below is a
                # reconstruction -- confirm against the original.
                run = paragraph.add_run(_ITALIC_MARKERS.sub(r'\1\2', span))
                self._apply_body_font_style(run)
                run.italic = True
            else:
                # NOTE(review): other span types (if the parser emits any)
                # are written unchanged; the original handling is unknown.
                self._apply_body_font_style(paragraph.add_run(span))

            current_pos = fmt['end']

        # Plain text after the last span.
        if current_pos < len(text):
            self._apply_body_font_style(paragraph.add_run(text[current_pos:]))

    def _apply_body_font_style(self, run) -> None:
        """Apply the body font of the active style to ``run``, if one is set.

        Args:
            run: DOCX run object.
        """
        body_font = self._active_style_part('body_font')
        if body_font:
            self._apply_font(run, body_font)

    def _add_code_block(self, doc: DocxDocument, content: str, language: str) -> None:
        """Add a code block as a single 'No Spacing' paragraph.

        Args:
            doc: DOCX document object.
            content: Code text.
            language: Programming language (currently unused).
        """
        para = doc.add_paragraph(style='No Spacing')
        run = para.add_run(content)

        code_style = self._active_style_part('code_block')
        if code_style:
            if code_style.font:
                self._apply_font(run, code_style.font)
            if code_style.paragraph:
                self._apply_spacing(para, code_style.paragraph)
        else:
            # Built-in default. NOTE(review): the spacing lines are assumed to
            # belong to this default branch (indentation was lost) -- confirm.
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
            para.paragraph_format.space_before = Pt(6)
            para.paragraph_format.space_after = Pt(6)

    def _add_table_to_doc(self, doc: DocxDocument, rows: List[List[str]]) -> None:
        """Add a 'Table Grid' table filled from ``rows``.

        Args:
            doc: DOCX document object.
            rows: Table rows, each a list of cell strings. Rows may differ in
                length; shorter rows leave their trailing cells empty.
        """
        if not rows:
            return
        # Size by the widest row so longer rows are not silently truncated.
        col_count = max(len(row) for row in rows)
        if col_count == 0:
            return

        table = doc.add_table(rows=len(rows), cols=col_count)
        table.style = 'Table Grid'
        table_style = self._active_style_part('table_style')

        for i, row_data in enumerate(rows):
            row_cells = table.rows[i].cells
            for j, cell_data in enumerate(row_data):
                if j >= len(row_cells):
                    break
                cell_para = row_cells[j].paragraphs[0]
                cell_para.clear()
                run = cell_para.add_run(text_processor.process_text_content(cell_data))

                if table_style:
                    if table_style.font:
                        self._apply_font(run, table_style.font)
                    if table_style.paragraph:
                        self._apply_spacing(cell_para, table_style.paragraph)

    def _add_horizontal_rule(self, doc: DocxDocument) -> None:
        """Add a horizontal rule, drawn as a centred run of underlined spaces.

        Args:
            doc: DOCX document object.
        """
        para = doc.add_paragraph()
        run = para.add_run()
        run.font.underline = True
        run.text = " " * 100  # long enough to read as a line
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # ------------------------------------------------------------------
    # Images
    # ------------------------------------------------------------------

    def _insert_section_image(self, doc: DocxDocument, image_files: List[str],
                              image_index: int, image_count: int,
                              output_path: str) -> int:
        """Insert the next image for a section and advance the image index.

        When the images run out, ``config.image_strategy`` decides what
        happens next: "cycle" restarts from the first image, "repeat_last"
        keeps reusing the last image, and anything else (including
        "truncate") stops inserting images.

        Args:
            doc: DOCX document object.
            image_files: Image file list.
            image_index: Index of the image to insert.
            image_count: Total number of images.
            output_path: Output file path.

        Returns:
            int: The updated image index. It is not advanced when the
            insertion fails.
        """
        if image_count <= 0 or image_index >= image_count:
            return image_index

        try:
            self._insert_image(doc, image_files[image_index], output_path)
            image_index += 1

            if image_index >= image_count:
                if config.image_strategy == "cycle":
                    image_index = 0
                elif config.image_strategy == "repeat_last":
                    # Stay on the last image. The original left the index at
                    # image_count here, so no further image was ever inserted.
                    image_index = image_count - 1
                else:
                    image_index = image_count
        except Exception as e:
            # Keep generating; leave a visible red marker in the document.
            para = doc.add_paragraph()
            run = para.add_run(f"[图片插入失败: {str(e)}]")
            run.font.color.rgb = RGBColor(255, 0, 0)

        return image_index

    def _insert_image(self, doc: DocxDocument, image_path: str, output_path: str) -> None:
        """Insert one image into the document in its own paragraph.

        Args:
            doc: DOCX document object.
            image_path: Image file path.
            output_path: Output file path; its directory holds the temporary
                image written when ``config.image_resize == "width"``.

        Raises:
            Exception: Wraps any failure; the original error is chained.
        """
        try:
            img, width = ImageProcessor.process_image(image_path)

            if config.image_resize == "width":
                # A bare file name has an empty dirname, which os.makedirs
                # rejects; fall back to the current directory.
                temp_dir = os.path.dirname(output_path) or "."
                os.makedirs(temp_dir, exist_ok=True)
                img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
                # Register before saving so a partially written file is still
                # removed during cleanup.
                self.temp_files.append(img_path)
                img.save(img_path)
            else:
                img_path = image_path

            para = doc.add_paragraph()
            run = para.runs[0] if para.runs else para.add_run()
            run.add_picture(img_path, width=Inches(width))
            para.alignment = ImageProcessor.get_image_alignment()
        except Exception as e:
            raise Exception(f"插入图片失败: {str(e)}") from e

    # ------------------------------------------------------------------
    # Disclaimer and cleanup
    # ------------------------------------------------------------------

    def _add_disclaimer(self, doc: DocxDocument) -> None:
        """Append a "---" separator paragraph followed by the disclaimer text.

        Args:
            doc: DOCX document object.
        """
        doc.add_paragraph("---")
        para = doc.add_paragraph()
        run = para.add_run(text_processor.process_text_content(DISCLAIMER_TEXT))
        run.font.size = Pt(10)
        para.paragraph_format.line_spacing = 1.0

    def _cleanup_temp_files(self) -> None:
        """Delete the recorded temporary files; failures are printed, not raised."""
        for temp_file in self.temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                continue  # never written or already removed
            except OSError as e:
                print(f"清理临时文件失败 {temp_file}: {e}")
        self.temp_files.clear()


# Global DOCX generator instance.
docx_generator = DocxGenerator()


def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str, progress_callback: Optional[Callable] = None) -> bool:
    """Generate a DOCX document (kept for compatibility with the old interface)."""
    return docx_generator.generate(sections, image_files, output_path, progress_callback)