428 lines
15 KiB
Python
428 lines
15 KiB
Python
|
|
"""
|
|||
|
|
DOCX document generation module.

Converts the parsed Markdown structure into a DOCX document, including text formatting, image insertion and style setup.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
from typing import List, Dict, Any, Callable, Optional
|
|||
|
|
from docx import Document
|
|||
|
|
from docx.shared import Inches, Pt, RGBColor
|
|||
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|||
|
|
from docx.enum.style import WD_STYLE_TYPE
|
|||
|
|
|
|||
|
|
from config import config
|
|||
|
|
from text_processor import text_processor
|
|||
|
|
from image_processor import ImageProcessor
|
|||
|
|
from markdown_parser import MarkdownParser
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Disclaimer text
|
|||
|
|
# NOTE(review): the text is wrapped in backticks (Markdown inline-code markers), but
# DocxGenerator._add_disclaimer adds it as a single plain run after
# text_processor.process_text_content -- confirm whether the backticks are meant to be
# stripped by that processor or will show up literally in the document.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`"""
|
|||
|
|
|
|||
|
|
|
|||
|
|
class DocxGenerator:
    """Builds a DOCX document from parsed Markdown sections.

    Temporary image files written while a document is generated are recorded
    in ``self.temp_files`` and removed at the end of ``generate``.
    """

    def __init__(self):
        """Initialise the generator with an empty temp-file list."""
        self.temp_files = []  # temp files to delete once generation finishes

    def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
                 output_path: str, progress_callback: Optional[Callable] = None) -> bool:
        """
        Generate the DOCX document and save it to ``output_path``.

        Args:
            sections: Parsed document sections; each is a dict that provides
                at least ``'content'`` and ``'level'`` and optionally ``'elements'``.
            image_files: Paths of the images to distribute over the sections.
            output_path: Where the resulting document is saved.
            progress_callback: Optional callable invoked as
                ``progress_callback(percent, message)``.

        Returns:
            bool: ``True`` when the document was written.

        Raises:
            Exception: Any failure is re-raised wrapped in an ``Exception``
                whose message starts with "生成DOCX失败"; the original error is
                attached as ``__cause__``.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            total_sections = len(sections)
            image_index = 0
            image_count = len(image_files)

            for i, section in enumerate(sections):
                if progress_callback:
                    # total_sections is non-zero here because the loop body only runs
                    # when there is at least one section.
                    progress = int((i / total_sections) * 100)
                    title = section['content']
                    section_title = title[:30] + "..." if len(title) > 30 else title
                    progress_callback(progress, f"处理章节: {section_title}")

                # Append the section and carry the image cursor forward.
                image_index = self._add_section_to_doc(
                    doc, section, image_files, image_index, image_count, output_path)

            if config.add_disclaimer:
                self._add_disclaimer(doc)

            doc.save(output_path)

            if progress_callback:
                progress_callback(100, "转换完成!")

            return True

        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise Exception(f"生成DOCX失败: {str(e)}") from e
        finally:
            # Temp images must stay on disk until doc.save() has run, so clean up last.
            self._cleanup_temp_files()

    def _setup_document_styles(self, doc: "Document") -> None:
        """
        Apply document-wide style settings.

        Sets the line spacing of the 'Normal' style to ``config.line_spacing``
        when that value is positive. Errors are reported with ``print`` and
        otherwise ignored (best effort).

        Args:
            doc: The DOCX document object.
        """
        try:
            styles = doc.styles

            if 'Normal' in styles:
                normal_style = styles['Normal']
                if config.line_spacing > 0:
                    normal_style.paragraph_format.line_spacing = config.line_spacing

        except Exception as e:
            print(f"设置文档样式时出错: {e}")

    def _add_section_to_doc(self, doc: "Document", section: Dict[str, Any],
                            image_files: List[str], image_index: int, image_count: int,
                            output_path: str) -> int:
        """
        Append one section (title, elements and its image) to the document.

        The section image is inserted right after the first element whose
        type is not ``'empty'``.

        Args:
            doc: The DOCX document object.
            section: Section data (``'level'``, ``'content'``, optional ``'elements'``).
            image_files: Image file paths.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path (used to place temporary image files).

        Returns:
            int: The updated image index.
        """
        if 0 < section['level'] <= config.title_levels:
            # Real heading: use a heading paragraph and keep inline formatting.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_heading(level=section['level'])
            self._apply_inline_formatting(para, heading_text)
        elif section['content'] != '前置内容':
            # Title outside the configured heading levels: render as bold 14pt text.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_paragraph()
            run = para.add_run(heading_text)
            run.font.size = Pt(14)
            run.font.bold = True
            # Spacing lives on paragraph_format; assigning para.space_after only
            # created an unused attribute on the Paragraph object.
            para.paragraph_format.space_after = Pt(12)

        elements = section.get('elements', [])
        if not elements:
            return image_index

        first_content_added = False

        for element in elements:
            self._add_element_to_doc(doc, element)

            # Insert the section image once, after the first non-empty element.
            if not first_content_added and element['type'] != 'empty':
                first_content_added = True
                image_index = self._insert_section_image(
                    doc, image_files, image_index, image_count, output_path)

        return image_index

    def _add_element_to_doc(self, doc: "Document", element: Dict[str, Any]) -> None:
        """
        Append a single parsed element to the document.

        Element types other than the ones handled below are ignored.

        Args:
            doc: The DOCX document object.
            element: Element data; ``'type'`` selects the handling, and
                ``'content'``, ``'language'`` or ``'rows'`` are read depending on it.
        """
        element_type = element['type']
        content = text_processor.process_text_content(element.get('content', ''))

        if element_type == 'paragraph':
            self._add_formatted_paragraph(doc, content)

        elif element_type == 'unordered_list':
            para = doc.add_paragraph(style='List Bullet')
            self._apply_inline_formatting(para, content)

        elif element_type == 'ordered_list':
            para = doc.add_paragraph(style='List Number')
            self._apply_inline_formatting(para, content)

        elif element_type == 'blockquote':
            para = doc.add_paragraph(style='Quote')
            self._apply_inline_formatting(para, content)

        elif element_type == 'code_block':
            # Code is passed through raw, not via text_processor.
            self._add_code_block(doc, element.get('content', ''), element.get('language', ''))

        elif element_type == 'table':
            self._add_table_to_doc(doc, element.get('rows', []))

        elif element_type == 'horizontal_rule':
            self._add_horizontal_rule(doc)

        elif element_type == 'empty':
            doc.add_paragraph()

    def _add_formatted_paragraph(self, doc: "Document", content: str) -> None:
        """
        Append a paragraph with inline formatting applied.

        Empty or whitespace-only content yields an empty paragraph.

        Args:
            doc: The DOCX document object.
            content: Paragraph text.
        """
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        self._apply_inline_formatting(para, content)

        if config.line_spacing > 0:
            para.paragraph_format.line_spacing = config.line_spacing

    def _apply_inline_formatting(self, paragraph, text: str) -> None:
        """
        Add ``text`` to ``paragraph`` as runs, honouring inline Markdown markup.

        Spans come from ``MarkdownParser.extract_inline_formatting``; each span
        is a dict with ``'type'``, ``'start'`` and ``'end'`` (and ``'text'`` for
        links). Text between spans is added as plain runs.

        Args:
            paragraph: The DOCX paragraph object to append runs to.
            text: Text to format (callers have already run it through text_processor).
        """
        formatting = MarkdownParser.extract_inline_formatting(text)

        if not formatting:
            paragraph.add_run(text)
            return

        current_pos = 0

        for fmt in formatting:
            # Defensive: the span extractor is not visible here. If it ever returns
            # overlapping spans, skip the inner one rather than emit the text twice.
            if fmt['start'] < current_pos:
                continue

            # Plain text preceding this span.
            if fmt['start'] > current_pos:
                paragraph.add_run(text[current_pos:fmt['start']])

            span_text = text[fmt['start']:fmt['end']]
            fmt_type = fmt['type']

            if fmt_type == 'bold':
                clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', span_text)
                run = paragraph.add_run(clean_text)
                run.bold = True

            elif fmt_type == 'italic':
                clean_text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|_(.+?)_', r'\1\2',
                                    span_text)
                run = paragraph.add_run(clean_text)
                run.italic = True

            elif fmt_type == 'code':
                clean_text = re.sub(r'`([^`]+)`', r'\1', span_text)
                run = paragraph.add_run(clean_text)
                run.font.name = 'Courier New'
                run.font.size = Pt(10)

            elif fmt_type == 'strikethrough':
                clean_text = re.sub(r'~~(.+?)~~', r'\1', span_text)
                run = paragraph.add_run(clean_text)
                run.font.strike = True

            elif fmt_type == 'link':
                # Only the link text is shown, styled blue and underlined.
                run = paragraph.add_run(fmt['text'])
                run.font.color.rgb = RGBColor(0, 0, 255)
                run.underline = True

            else:
                # Unknown span type: keep the raw text instead of dropping it.
                paragraph.add_run(span_text)

            current_pos = fmt['end']

        # Trailing plain text after the last span.
        if current_pos < len(text):
            paragraph.add_run(text[current_pos:])

    def _add_code_block(self, doc: "Document", content: str, language: str) -> None:
        """
        Append a code block as a 'No Spacing' paragraph in 10pt Courier New.

        Args:
            doc: The DOCX document object.
            content: Code text, added verbatim.
            language: Programming language (currently unused).
        """
        para = doc.add_paragraph(style='No Spacing')
        run = para.add_run(content)
        run.font.name = 'Courier New'
        run.font.size = Pt(10)

        # Spacing around the block is best effort: report failures, never abort.
        try:
            para.paragraph_format.space_before = Pt(6)
            para.paragraph_format.space_after = Pt(6)
        except Exception as e:
            print(f"设置代码块间距时出错: {e}")

    def _add_table_to_doc(self, doc: "Document", rows: List[List[str]]) -> None:
        """
        Append a table using the 'Table Grid' style.

        The table is as wide as the widest row, so rows longer than the first
        one keep all their cells; shorter rows leave trailing cells empty.

        Args:
            doc: The DOCX document object.
            rows: Table rows, each a list of cell strings.
        """
        if not rows:
            return

        col_count = max(len(row_data) for row_data in rows)
        if col_count == 0:
            return

        table = doc.add_table(rows=len(rows), cols=col_count)
        table.style = 'Table Grid'

        for i, row_data in enumerate(rows):
            row_cells = table.rows[i].cells
            for j, cell_data in enumerate(row_data):
                row_cells[j].text = text_processor.process_text_content(cell_data)

    def _add_horizontal_rule(self, doc: "Document") -> None:
        """
        Append a horizontal rule, drawn as a centred run of 100 underlined spaces.

        Args:
            doc: The DOCX document object.
        """
        para = doc.add_paragraph()
        run = para.add_run()
        run.font.underline = True
        run.text = " " * 100  # long underlined blank run stands in for a rule
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    @staticmethod
    def _wrap_image_index(image_count: int, strategy: str) -> int:
        """
        Return the image index to use once the last image has been consumed.

        Args:
            image_count: Total number of images (must be > 0).
            strategy: ``"cycle"`` restarts at the first image, ``"repeat_last"``
                keeps pointing at the last image, anything else (including
                ``"truncate"``) returns ``image_count`` so no more images are inserted.

        Returns:
            int: The next image index.
        """
        if strategy == "cycle":
            return 0
        if strategy == "repeat_last":
            return image_count - 1
        return image_count

    def _insert_section_image(self, doc: "Document", image_files: List[str],
                              image_index: int, image_count: int, output_path: str) -> int:
        """
        Insert the next image for a section, if one is available.

        When the images run out, ``config.image_strategy`` decides what the
        following sections get (see ``_wrap_image_index``). If insertion fails,
        a red error paragraph is added instead and the index is left unchanged.

        Args:
            doc: The DOCX document object.
            image_files: Image file paths.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path.

        Returns:
            int: The updated image index.
        """
        if image_count > 0 and image_index < image_count:
            try:
                self._insert_image(doc, image_files[image_index], output_path)
                image_index += 1

                if image_index >= image_count:
                    image_index = self._wrap_image_index(image_count, config.image_strategy)

            except Exception as e:
                para = doc.add_paragraph()
                run = para.add_run(f"[图片插入失败: {str(e)}]")
                run.font.color.rgb = RGBColor(255, 0, 0)

        return image_index

    def _insert_image(self, doc: "Document", image_path: str, output_path: str) -> None:
        """
        Insert one image into the document in its own paragraph.

        When ``config.image_resize == "width"`` the processed image is saved to
        a temporary PNG next to the output file and that copy is inserted;
        otherwise the original file is inserted.

        Args:
            doc: The DOCX document object.
            image_path: Path of the image file.
            output_path: Output file path (its directory hosts the temp image).

        Raises:
            Exception: Wraps any failure, with a message starting "插入图片失败".
        """
        try:
            img, width = ImageProcessor.process_image(image_path)

            if config.image_resize == "width":
                # abspath avoids an empty dirname (and a failing makedirs('')) when
                # output_path is a bare file name.
                temp_dir = os.path.dirname(os.path.abspath(output_path))
                os.makedirs(temp_dir, exist_ok=True)
                temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
                img.save(temp_img_path)
                self.temp_files.append(temp_img_path)
                img_path = temp_img_path
            else:
                img_path = image_path

            para = doc.add_paragraph()
            run = para.add_run()
            run.add_picture(img_path, width=Inches(width))
            para.alignment = ImageProcessor.get_image_alignment()

        except Exception as e:
            raise Exception(f"插入图片失败: {str(e)}") from e

    def _add_disclaimer(self, doc: "Document") -> None:
        """
        Append the disclaimer: a "---" paragraph followed by the disclaimer text
        in 10pt with single line spacing.

        Args:
            doc: The DOCX document object.
        """
        doc.add_paragraph("---")
        para = doc.add_paragraph()
        disclaimer_text = text_processor.process_text_content(DISCLAIMER_TEXT)
        run = para.add_run(disclaimer_text)
        run.font.size = Pt(10)
        para.paragraph_format.line_spacing = 1.0

    def _cleanup_temp_files(self) -> None:
        """Delete every tracked temp file (best effort) and reset the list."""
        for temp_file in self.temp_files:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except Exception as e:
                print(f"清理临时文件失败 {temp_file}: {e}")
        self.temp_files.clear()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Create the module-level DOCX generator instance
|
|||
|
|
# NOTE: this single instance serves every call to the module-level generate() wrapper,
# so its temp_files list is shared state between those calls.
docx_generator = DocxGenerator()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Function kept for compatibility with the old interface
|
|||
|
|
def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str, progress_callback: Optional[Callable] = None) -> bool:
    """Generate a DOCX document via the shared ``docx_generator`` (legacy interface).

    All arguments are forwarded unchanged to ``docx_generator.generate`` and
    its result is returned as-is.
    """
    succeeded = docx_generator.generate(
        sections=sections,
        image_files=image_files,
        output_path=output_path,
        progress_callback=progress_callback,
    )
    return succeeded
|