"""DOCX document generation module.

Converts the parsed Markdown structure into a DOCX document, including text
formatting, image insertion and style setup.
"""

import os
import re
from typing import List, Dict, Any, Callable, Optional

from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE

from config import config
from text_processor import text_processor
from image_processor import ImageProcessor
from markdown_parser import MarkdownParser

# Disclaimer text appended to the end of the document when
# ``config.add_disclaimer`` is truthy.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`"""


class DocxGenerator:
    """Builds a DOCX document from parsed Markdown sections and images."""

    def __init__(self):
        """Initialise the generator with an empty temp-file registry."""
        # Temporary image files created during generation; removed in
        # ``_cleanup_temp_files``.
        self.temp_files = []

    def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
                 output_path: str,
                 progress_callback: Optional[Callable] = None) -> bool:
        """Generate the DOCX document and save it to ``output_path``.

        Args:
            sections: Parsed document sections. Each section is read via the
                keys ``'content'``, ``'level'`` and (optionally)
                ``'elements'``.
            image_files: Paths of the images to distribute over sections.
            output_path: Destination path of the DOCX file.
            progress_callback: Optional callable invoked as
                ``progress_callback(percent, message)``.

        Returns:
            bool: ``True`` when the document was written.

        Raises:
            Exception: When generation fails; the original error is chained
                as ``__cause__``.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            total_sections = len(sections)
            image_index = 0
            image_count = len(image_files)

            for i, section in enumerate(sections):
                if progress_callback:
                    # total_sections is > 0 here because the loop body only
                    # runs when there is at least one section.
                    progress = int((i / total_sections) * 100)
                    section_title = section['content']
                    if len(section_title) > 30:
                        section_title = section_title[:30] + "..."
                    progress_callback(progress, f"处理章节: {section_title}")

                image_index = self._add_section_to_doc(
                    doc, section, image_files, image_index, image_count,
                    output_path)

            if config.add_disclaimer:
                self._add_disclaimer(doc)

            doc.save(output_path)

            if progress_callback:
                progress_callback(100, "转换完成!")

            return True

        except Exception as e:
            # Same exception type and message as before; ``from e`` keeps the
            # original traceback attached for debugging.
            raise Exception(f"生成DOCX失败: {str(e)}") from e
        finally:
            # Temp images must be removed whether or not saving succeeded.
            self._cleanup_temp_files()

    def _setup_document_styles(self, doc: Document) -> None:
        """Apply the configured line spacing to the 'Normal' style.

        Failures are reported on stdout and otherwise ignored, so a styling
        problem never aborts document generation.

        Args:
            doc: DOCX document object.
        """
        try:
            styles = doc.styles

            if 'Normal' in styles:
                normal_style = styles['Normal']
                if config.line_spacing > 0:
                    normal_style.paragraph_format.line_spacing = config.line_spacing

        except Exception as e:
            print(f"设置文档样式时出错: {e}")

    def _add_section_to_doc(self, doc: Document, section: Dict[str, Any],
                            image_files: List[str], image_index: int,
                            image_count: int, output_path: str) -> int:
        """Add one section (title, elements and its image) to the document.

        Args:
            doc: DOCX document object.
            section: Section data (``'level'``, ``'content'``, ``'elements'``).
            image_files: Image file paths.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path (used to place temp files).

        Returns:
            int: The updated image index.
        """
        if 0 < section['level'] <= config.title_levels:
            # Levels within the configured range become real headings.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_heading(level=section['level'])
            self._apply_inline_formatting(para, heading_text)
        elif section['content'] != '前置内容':
            # Other titles are rendered as a bold paragraph; the placeholder
            # title '前置内容' is not rendered at all.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_paragraph()
            run = para.add_run(heading_text)
            run.font.size = Pt(14)
            run.font.bold = True
            # Spacing lives on paragraph_format; assigning ``space_after``
            # directly on the Paragraph object has no effect on the output.
            para.paragraph_format.space_after = Pt(12)

        elements = section.get('elements', [])
        if not elements:
            return image_index

        # The section image goes right after the first non-empty element.
        first_content_added = False

        for element in elements:
            self._add_element_to_doc(doc, element)

            if not first_content_added and element['type'] != 'empty':
                first_content_added = True
                image_index = self._insert_section_image(
                    doc, image_files, image_index, image_count, output_path)

        return image_index

    def _add_element_to_doc(self, doc: Document, element: Dict[str, Any]) -> None:
        """Add a single parsed element to the document.

        Element types other than the ones handled below are ignored.

        Args:
            doc: DOCX document object.
            element: Element data; ``'type'`` selects the handler.
        """
        element_type = element['type']
        content = text_processor.process_text_content(element.get('content', ''))

        if element_type == 'paragraph':
            self._add_formatted_paragraph(doc, content)

        elif element_type == 'unordered_list':
            para = doc.add_paragraph(style='List Bullet')
            self._apply_inline_formatting(para, content)

        elif element_type == 'ordered_list':
            para = doc.add_paragraph(style='List Number')
            self._apply_inline_formatting(para, content)

        elif element_type == 'blockquote':
            para = doc.add_paragraph(style='Quote')
            self._apply_inline_formatting(para, content)

        elif element_type == 'code_block':
            # Code is passed through unprocessed so its text stays verbatim.
            self._add_code_block(doc, element.get('content', ''),
                                 element.get('language', ''))

        elif element_type == 'table':
            self._add_table_to_doc(doc, element.get('rows', []))

        elif element_type == 'horizontal_rule':
            self._add_horizontal_rule(doc)

        elif element_type == 'empty':
            doc.add_paragraph()

    def _add_formatted_paragraph(self, doc: Document, content: str) -> None:
        """Add a paragraph with inline formatting applied.

        Args:
            doc: DOCX document object.
            content: Paragraph text; blank content yields an empty paragraph.
        """
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        self._apply_inline_formatting(para, content)

        if config.line_spacing > 0:
            para.paragraph_format.line_spacing = config.line_spacing

    def _apply_inline_formatting(self, paragraph, text: str) -> None:
        """Append ``text`` to ``paragraph`` as runs with inline formatting.

        Args:
            paragraph: DOCX paragraph object.
            text: Text to format (already processed by the caller).
        """
        # Each entry is read via the keys 'start', 'end' and 'type'.
        formatting = MarkdownParser.extract_inline_formatting(text)

        if not formatting:
            paragraph.add_run(text)
            return

        current_pos = 0

        for fmt in formatting:
            # Plain text between the previous span and this one.
            if fmt['start'] > current_pos:
                paragraph.add_run(text[current_pos:fmt['start']])

            span = text[fmt['start']:fmt['end']]

            if fmt['type'] == 'bold':
                clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', span)
                run = paragraph.add_run(clean_text)
                run.bold = True
            elif fmt['type'] == 'italic':
                # NOTE(review): from this pattern to the end of the method the
                # original text was lost in extraction; this is a
                # reconstruction modelled on the 'bold' branch. Confirm
                # against the upstream file.
                clean_text = re.sub(
                    r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|(?<!_)_(?!_)(.+?)(?<!_)_(?!_)',
                    r'\1\2', span)
                run = paragraph.add_run(clean_text)
                run.italic = True
            else:
                # NOTE(review): any further format types the parser may emit
                # are not visible here; keep their text unformatted rather
                # than dropping it.
                paragraph.add_run(span)

            current_pos = fmt['end']

        # Trailing plain text after the last formatted span.
        if current_pos < len(text):
            paragraph.add_run(text[current_pos:])

    # NOTE(review): the signature below was lost in extraction and is
    # reconstructed from the call site in ``_add_element_to_doc`` and from the
    # surviving docstring; the default for ``language`` is an assumption.
    def _add_code_block(self, doc: Document, content: str,
                        language: str = '') -> None:
        """Add a code block as a monospaced paragraph.

        Args:
            doc: DOCX document object.
            content: Code text.
            language: Programming language (currently unused).
        """
        para = doc.add_paragraph(style='No Spacing')
        run = para.add_run(content)
        run.font.name = 'Courier New'
        run.font.size = Pt(10)

        # Spacing is cosmetic: report a failure instead of aborting, but do
        # not swallow it silently.
        try:
            para.paragraph_format.space_before = Pt(6)
            para.paragraph_format.space_after = Pt(6)
        except Exception as e:
            print(f"设置代码块间距时出错: {e}")

    def _add_table_to_doc(self, doc: Document, rows: List[List[str]]) -> None:
        """Add a table to the document.

        The column count is taken from the first row; extra cells in later
        rows are ignored.

        Args:
            doc: DOCX document object.
            rows: Table rows, each a list of cell strings.
        """
        if not rows:
            return

        table = doc.add_table(rows=len(rows), cols=len(rows[0]))
        table.style = 'Table Grid'

        for i, row_data in enumerate(rows):
            row_cells = table.rows[i].cells
            for j, cell_data in enumerate(row_data):
                if j < len(row_cells):
                    processed_text = text_processor.process_text_content(cell_data)
                    row_cells[j].text = processed_text

    def _add_horizontal_rule(self, doc: Document) -> None:
        """Add a horizontal line, emulated by a run of underlined spaces.

        Args:
            doc: DOCX document object.
        """
        para = doc.add_paragraph()
        run = para.add_run()
        run.font.underline = True
        run.text = " " * 100  # long enough to read as a full-width rule
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    def _insert_section_image(self, doc: Document, image_files: List[str],
                              image_index: int, image_count: int,
                              output_path: str) -> int:
        """Insert the next image for a section and advance the image index.

        When the images run out, ``config.image_strategy`` decides what
        happens next: ``"cycle"`` restarts from the first image,
        ``"truncate"`` stops inserting, and ``"repeat_last"`` keeps reusing
        the last image. On failure a red error note is written into the
        document and the index is left unchanged.

        Args:
            doc: DOCX document object.
            image_files: Image file paths.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path.

        Returns:
            int: The updated image index.
        """
        if image_count > 0 and image_index < image_count:
            try:
                self._insert_image(doc, image_files[image_index], output_path)
                image_index += 1

                if image_index >= image_count:
                    if config.image_strategy == "cycle":
                        image_index = 0
                    elif config.image_strategy == "truncate":
                        image_index = image_count
                    elif config.image_strategy == "repeat_last":
                        # Step back so the next section reuses the last
                        # image; previously the index stayed past the end and
                        # this strategy behaved like "truncate".
                        # NOTE(review): the strategy name is taken from the
                        # original comment — confirm against config.
                        image_index = image_count - 1

            except Exception as e:
                para = doc.add_paragraph()
                run = para.add_run(f"[图片插入失败: {str(e)}]")
                run.font.color.rgb = RGBColor(255, 0, 0)  # red

        return image_index

    def _insert_image(self, doc: Document, image_path: str,
                      output_path: str) -> None:
        """Insert one image into the document in its own paragraph.

        Args:
            doc: DOCX document object.
            image_path: Path of the image file.
            output_path: Output file path; temp images are written next to it.

        Raises:
            Exception: When the image cannot be processed or inserted; the
                original error is chained as ``__cause__``.
        """
        try:
            img, width = ImageProcessor.process_image(image_path)

            if config.image_resize == "width":
                # The processed image is saved to a temp file for insertion.
                # abspath() guards against a bare filename, whose dirname is
                # '' and would make os.makedirs raise FileNotFoundError.
                temp_dir = os.path.dirname(os.path.abspath(output_path))
                os.makedirs(temp_dir, exist_ok=True)
                temp_img_path = os.path.join(
                    temp_dir, f"temp_img_{hash(image_path)}.png")
                img.save(temp_img_path)
                self.temp_files.append(temp_img_path)
                img_path = temp_img_path
            else:
                img_path = image_path

            para = doc.add_paragraph()
            run = para.runs[0] if para.runs else para.add_run()
            run.add_picture(img_path, width=Inches(width))
            para.alignment = ImageProcessor.get_image_alignment()

        except Exception as e:
            raise Exception(f"插入图片失败: {str(e)}") from e

    def _add_disclaimer(self, doc: Document) -> None:
        """Append a separator line and the disclaimer text.

        Args:
            doc: DOCX document object.
        """
        doc.add_paragraph("---")
        para = doc.add_paragraph()
        disclaimer_text = text_processor.process_text_content(DISCLAIMER_TEXT)
        run = para.add_run(disclaimer_text)
        run.font.size = Pt(10)
        para.paragraph_format.line_spacing = 1.0

    def _cleanup_temp_files(self) -> None:
        """Delete all registered temp files and reset the registry.

        Deletion errors are printed and do not stop the cleanup.
        """
        for temp_file in self.temp_files:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except Exception as e:
                print(f"清理临时文件失败 {temp_file}: {e}")
        self.temp_files.clear()


# Module-level generator instance shared by the legacy function below.
docx_generator = DocxGenerator()


# Function kept for compatibility with the old interface.
def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str,
             progress_callback: Optional[Callable] = None) -> bool:
    """Generate a DOCX document (legacy-compatible wrapper)."""
    return docx_generator.generate(sections, image_files, output_path,
                                   progress_callback)