TxT2Docx/docx_generator.py

611 lines
26 KiB
Python
Raw Normal View History

2025-09-21 19:01:40 +08:00
"""
DOCX document generation module.

Converts the parsed Markdown structure into a DOCX document, including text
formatting, image insertion and style settings.
"""
import os
import re
from typing import List, Dict, Any, Callable, Optional
from docx import Document
2025-09-22 21:10:29 +08:00
from docx.document import Document as DocxDocument
2025-09-21 19:01:40 +08:00
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from config import config
from text_processor import text_processor
from image_processor import ImageProcessor
from markdown_parser import MarkdownParser
2025-09-22 21:10:29 +08:00
from style_manager import style_manager
2025-09-21 19:01:40 +08:00
# Disclaimer text; DocxGenerator._add_disclaimer appends it to the document.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`"""
class DocxGenerator:
    """Generate a DOCX file from parsed Markdown sections.

    The generator keeps two pieces of state between calls:

    * ``temp_files`` - temporary image files written while inserting
      pictures; they are removed at the end of every ``generate`` call.
    * ``current_document_style`` - the style object selected through
      ``config.current_style`` for the document being generated, or ``None``
      when built-in defaults should be used.
    """

    def __init__(self):
        """Initialise an idle generator with no temp files and no style."""
        self.temp_files = []  # temporary files to delete after generation
        self.current_document_style = None  # style used for the current document

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------
    def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
                 output_path: str, progress_callback: Optional[Callable] = None) -> bool:
        """Generate the DOCX document and save it to ``output_path``.

        Args:
            sections: Parsed document sections. Each section is read through
                its ``'content'`` and ``'level'`` keys and its optional
                ``'elements'`` list.
            image_files: Paths of the images to distribute over the sections.
            output_path: Destination path of the DOCX file.
            progress_callback: Optional callable invoked as
                ``callback(percent, message)`` before each section and once
                more with ``100`` after saving.

        Returns:
            bool: ``True`` when the document was saved.

        Raises:
            Exception: Any failure is re-raised as a generic ``Exception``
                whose message starts with the generation-failed prefix; the
                original error is attached as ``__cause__``.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            total_sections = len(sections)
            image_index = 0
            image_count = len(image_files)

            for i, section in enumerate(sections):
                if progress_callback:
                    progress = int((i / total_sections) * 100)
                    title = section['content']
                    section_title = title[:30] + "..." if len(title) > 30 else title
                    progress_callback(progress, f"处理章节: {section_title}")

                image_index = self._add_section_to_doc(
                    doc, section, image_files, image_index, image_count, output_path
                )

            if config.add_disclaimer:
                self._add_disclaimer(doc)

            doc.save(output_path)

            if progress_callback:
                progress_callback(100, "转换完成!")
            return True
        except Exception as e:
            # Keep the original error reachable through __cause__.
            raise Exception(f"生成DOCX失败: {str(e)}") from e
        finally:
            # Temporary images are only needed until the document is saved.
            self._cleanup_temp_files()

    # ------------------------------------------------------------------
    # Style helpers
    # ------------------------------------------------------------------
    def _setup_document_styles(self, doc: "DocxDocument") -> None:
        """Select the document style named by ``config.current_style``.

        The previously selected style is discarded first, so a failed lookup
        never leaves the style of an earlier ``generate`` call active on a
        reused generator instance.

        Args:
            doc: DOCX document object (not modified by this method).
        """
        self.current_document_style = None
        try:
            current_style = style_manager.get_style(config.current_style)
            if not current_style:
                print(f"警告: 找不到样式 '{config.current_style}',使用默认样式")
                return
            self.current_document_style = current_style
            print(f"应用文档样式: {current_style.name}")
        except Exception as e:
            # Styling is best-effort: report the problem and fall back to defaults.
            print(f"设置文档样式时出错: {e}")

    @staticmethod
    def _apply_font(run, font, include_emphasis: bool = True) -> bool:
        """Copy the settings of a style font object onto ``run``.

        Args:
            run: DOCX run object.
            font: Style font object exposing ``name``, ``size``, ``bold``,
                ``italic`` and ``color``; may be ``None``.
            include_emphasis: When ``False`` the bold/italic flags of ``font``
                are not copied.

        Returns:
            bool: ``True`` if ``font`` was applied, ``False`` if it was falsy
            and the caller should apply its own defaults.
        """
        if not font:
            return False
        run.font.name = font.name
        run.font.size = Pt(font.size)
        if include_emphasis:
            run.font.bold = font.bold
            run.font.italic = font.italic
        # "#000000" is treated as "no explicit colour" and left untouched.
        if font.color != "#000000":
            run.font.color.rgb = RGBColor.from_string(font.color.replace('#', ''))
        return True

    @staticmethod
    def _apply_spacing(para, para_style) -> None:
        """Apply the positive ``space_before``/``space_after`` values of ``para_style``."""
        fmt = para.paragraph_format
        if para_style.space_before > 0:
            fmt.space_before = Pt(para_style.space_before)
        if para_style.space_after > 0:
            fmt.space_after = Pt(para_style.space_after)

    def _apply_paragraph_layout(self, para, para_style, default_left: bool) -> None:
        """Apply line spacing, spacing, first-line indent and alignment.

        Args:
            para: DOCX paragraph object.
            para_style: Style paragraph object exposing ``line_spacing``,
                ``space_before``, ``space_after``, ``first_line_indent`` and
                ``alignment``.
            default_left: When ``True`` an alignment other than "center",
                "right" or "justify" sets left alignment; when ``False`` the
                paragraph alignment is left unchanged in that case.
        """
        fmt = para.paragraph_format
        if para_style.line_spacing > 0:
            fmt.line_spacing = para_style.line_spacing
        self._apply_spacing(para, para_style)
        if para_style.first_line_indent > 0:
            # The indent is multiplied by 12 to convert characters to points.
            fmt.first_line_indent = Pt(para_style.first_line_indent * 12)

        alignment = para_style.alignment
        if alignment == "center":
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif alignment == "right":
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        elif alignment == "justify":
            para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        elif default_left:
            para.alignment = WD_ALIGN_PARAGRAPH.LEFT

    def _heading_style_for(self, level):
        """Return the heading style configured for ``level``, or ``None``."""
        style = self.current_document_style
        if not style or not style.heading_styles:
            return None
        if level in style.heading_styles:
            return style.heading_styles[level]
        return None

    def _apply_body_font_style(self, run) -> None:
        """Apply the body font of the current document style to ``run``.

        Does nothing when no style is selected or the style has no body font.

        Args:
            run: DOCX run object.
        """
        style = self.current_document_style
        if style and style.body_font:
            self._apply_font(run, style.body_font)

    # ------------------------------------------------------------------
    # Sections and elements
    # ------------------------------------------------------------------
    def _add_section_to_doc(self, doc: "DocxDocument", section: Dict[str, Any],
                            image_files: List[str], image_index: int, image_count: int,
                            output_path: str) -> int:
        """Add one section (title, elements and its image) to the document.

        Args:
            doc: DOCX document object.
            section: Section data (``'level'``, ``'content'``, ``'elements'``).
            image_files: Image file list.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path, used to place temporary files.

        Returns:
            int: Updated image index.
        """
        level = section['level']

        if 0 < level <= config.title_levels:
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_heading(level=level)
            # Start from an empty heading and add the title exactly once as a
            # single run so the custom heading style applies to all of it.
            para.clear()
            run = para.add_run(heading_text)

            heading_style = self._heading_style_for(level)
            if not self._apply_font(run, heading_style.font if heading_style else None):
                # Default heading font: size shrinks with the heading level.
                run.font.size = Pt(18 - level * 2 if level <= 6 else 10)
                run.font.bold = True
            if heading_style and heading_style.paragraph:
                self._apply_paragraph_layout(para, heading_style.paragraph, default_left=True)
        elif section['content'] != '前置内容':
            # Title outside the configured heading levels: plain bold paragraph.
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_paragraph()
            run = para.add_run(heading_text)

            heading_style = self._heading_style_for(level)
            if not self._apply_font(run, heading_style.font if heading_style else None):
                run.font.size = Pt(14)
                run.font.bold = True
                para.paragraph_format.space_after = Pt(12)

        elements = section.get('elements', [])
        if not elements:
            return image_index

        # The section image goes right after the first non-empty element.
        first_content_added = False
        for element in elements:
            self._add_element_to_doc(doc, element)
            if not first_content_added and element['type'] not in ['empty']:
                first_content_added = True
                image_index = self._insert_section_image(
                    doc, image_files, image_index, image_count, output_path
                )
        return image_index

    def _add_element_to_doc(self, doc: "DocxDocument", element: Dict[str, Any]) -> None:
        """Add one parsed element to the document according to its ``'type'``.

        Element types other than the ones handled below are ignored.

        Args:
            doc: DOCX document object.
            element: Element data.
        """
        element_type = element['type']
        content = text_processor.process_text_content(element.get('content', ''))
        style = self.current_document_style

        if element_type == 'paragraph':
            self._add_formatted_paragraph(doc, content)
        elif element_type == 'unordered_list':
            para = doc.add_paragraph(style='List Bullet')
            self._apply_inline_formatting(para, content)
            list_style = style.unordered_list if style else None
            if list_style and list_style.paragraph:
                self._apply_spacing(para, list_style.paragraph)
        elif element_type == 'ordered_list':
            para = doc.add_paragraph(style='List Number')
            self._apply_inline_formatting(para, content)
            list_style = style.ordered_list if style else None
            if list_style and list_style.paragraph:
                self._apply_spacing(para, list_style.paragraph)
        elif element_type == 'blockquote':
            para = doc.add_paragraph(style='Quote')
            self._apply_inline_formatting(para, content)
            quote_style = style.quote_block if style else None
            if quote_style and quote_style.paragraph:
                # Unknown alignment values keep the alignment of the 'Quote' style.
                self._apply_paragraph_layout(para, quote_style.paragraph, default_left=False)
        elif element_type == 'code_block':
            # Code is inserted verbatim, without text processing.
            self._add_code_block(doc, element.get('content', ''), element.get('language', ''))
        elif element_type == 'table':
            self._add_table_to_doc(doc, element.get('rows', []))
        elif element_type == 'horizontal_rule':
            self._add_horizontal_rule(doc)
        elif element_type == 'empty':
            doc.add_paragraph()

    def _add_formatted_paragraph(self, doc: "DocxDocument", content: str) -> None:
        """Add a body paragraph with inline formatting and paragraph layout.

        Blank content produces an empty paragraph.

        Args:
            doc: DOCX document object.
            content: Paragraph text.
        """
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        self._apply_inline_formatting(para, content)

        style = self.current_document_style
        if style:
            if style.body_paragraph:
                self._apply_paragraph_layout(para, style.body_paragraph, default_left=True)
        elif config.line_spacing > 0:
            # No document style selected: fall back to the configured line spacing.
            para.paragraph_format.line_spacing = config.line_spacing

    def _apply_inline_formatting(self, paragraph, text: str) -> None:
        """Append ``text`` to ``paragraph`` as runs, honouring inline markup.

        The spans reported by ``MarkdownParser.extract_inline_formatting`` are
        emitted as bold, italic, code, strikethrough or link runs; the text
        between them is emitted as plain runs with the body font.

        Args:
            paragraph: DOCX paragraph object.
            text: Text to format (already processed by the caller).
        """
        processed_text = text
        formatting = MarkdownParser.extract_inline_formatting(processed_text)

        if not formatting:
            run = paragraph.add_run(processed_text)
            self._apply_body_font_style(run)
            return

        current_pos = 0
        for fmt in formatting:
            # Plain text between the previous span and this one.
            if fmt['start'] > current_pos:
                run = paragraph.add_run(processed_text[current_pos:fmt['start']])
                self._apply_body_font_style(run)

            span = processed_text[fmt['start']:fmt['end']]
            fmt_type = fmt['type']

            if fmt_type == 'bold':
                clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', span)
                run = paragraph.add_run(clean_text)
                self._apply_body_font_style(run)
                run.bold = True
            elif fmt_type == 'italic':
                clean_text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|_(.+?)_', r'\1\2', span)
                run = paragraph.add_run(clean_text)
                self._apply_body_font_style(run)
                run.italic = True
            elif fmt_type == 'code':
                clean_text = re.sub(r'`([^`]+)`', r'\1', span)
                run = paragraph.add_run(clean_text)
                # Inline code takes name, size and colour from the code block
                # style (not bold/italic), or a default monospace font.
                style = self.current_document_style
                code_style = style.code_block if style else None
                code_font = code_style.font if code_style else None
                if not self._apply_font(run, code_font, include_emphasis=False):
                    run.font.name = 'Courier New'
                    run.font.size = Pt(10)
            elif fmt_type == 'strikethrough':
                clean_text = re.sub(r'~~(.+?)~~', r'\1', span)
                run = paragraph.add_run(clean_text)
                self._apply_body_font_style(run)
                run.font.strike = True
            elif fmt_type == 'link':
                # Only the link text is shown, styled blue and underlined.
                run = paragraph.add_run(fmt['text'])
                self._apply_body_font_style(run)
                run.font.color.rgb = RGBColor(0, 0, 255)
                run.underline = True

            current_pos = fmt['end']

        # Plain text after the last span.
        if current_pos < len(processed_text):
            run = paragraph.add_run(processed_text[current_pos:])
            self._apply_body_font_style(run)

    def _add_code_block(self, doc: "DocxDocument", content: str, language: str) -> None:
        """Add a code block paragraph.

        Default spacing (6 pt before and after) is set first so that spacing
        from the selected style's code block settings can override it.

        Args:
            doc: DOCX document object.
            content: Code text, inserted as a single run.
            language: Programming language (currently unused).
        """
        para = doc.add_paragraph(style='No Spacing')
        run = para.add_run(content)

        para.paragraph_format.space_before = Pt(6)
        para.paragraph_format.space_after = Pt(6)

        style = self.current_document_style
        code_style = style.code_block if style else None
        if not self._apply_font(run, code_style.font if code_style else None):
            # Default code font.
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
        if code_style and code_style.paragraph:
            self._apply_spacing(para, code_style.paragraph)

    def _add_table_to_doc(self, doc: "DocxDocument", rows: List[List[str]]) -> None:
        """Add a table to the document.

        The table is as wide as the widest row so that no cell of a ragged
        row is dropped; shorter rows leave their trailing cells empty.

        Args:
            doc: DOCX document object.
            rows: Table rows, each a list of cell texts.
        """
        if not rows:
            return
        col_count = max(len(row_data) for row_data in rows)
        if col_count == 0:
            return

        table = doc.add_table(rows=len(rows), cols=col_count)
        table.style = 'Table Grid'

        style = self.current_document_style
        table_style = style.table_style if style else None

        for table_row, row_data in zip(table.rows, rows):
            for cell, cell_data in zip(table_row.cells, row_data):
                processed_text = text_processor.process_text_content(cell_data)
                cell_para = cell.paragraphs[0]
                cell_para.clear()
                run = cell_para.add_run(processed_text)
                if table_style:
                    self._apply_font(run, table_style.font)
                    if table_style.paragraph:
                        self._apply_spacing(cell_para, table_style.paragraph)

    def _add_horizontal_rule(self, doc: "DocxDocument") -> None:
        """Add a horizontal rule, drawn as a centred run of underlined spaces.

        Args:
            doc: DOCX document object.
        """
        para = doc.add_paragraph()
        run = para.add_run()
        run.font.underline = True
        run.text = " " * 100  # long enough underline to look like a rule
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # ------------------------------------------------------------------
    # Images
    # ------------------------------------------------------------------
    def _insert_section_image(self, doc: "DocxDocument", image_files: List[str],
                              image_index: int, image_count: int, output_path: str) -> int:
        """Insert the next image for a section and advance the image index.

        When the images run out, ``config.image_strategy`` decides what
        happens next: "cycle" restarts from the first image, "truncate" stops
        inserting images and "repeat_last" keeps reusing the last image.
        An insertion failure is reported as a red paragraph in the document
        and leaves the index unchanged.

        Args:
            doc: DOCX document object.
            image_files: Image file list.
            image_index: Index of the image to insert.
            image_count: Total number of images.
            output_path: Output file path.

        Returns:
            int: Updated image index.
        """
        if image_count > 0 and image_index < image_count:
            try:
                self._insert_image(doc, image_files[image_index], output_path)
                image_index += 1
                if image_index >= image_count:
                    if config.image_strategy == "cycle":
                        image_index = 0
                    elif config.image_strategy == "truncate":
                        image_index = image_count
                    elif config.image_strategy == "repeat_last":
                        # Step back so the next section reuses the last image.
                        image_index = image_count - 1
            except Exception as e:
                para = doc.add_paragraph()
                run = para.add_run(f"[图片插入失败: {str(e)}]")
                run.font.color.rgb = RGBColor(255, 0, 0)
        return image_index

    def _insert_image(self, doc: "DocxDocument", image_path: str, output_path: str) -> None:
        """Insert one image into the document as its own paragraph.

        When ``config.image_resize`` is "width", the image returned by
        ``ImageProcessor.process_image`` is saved as a temporary PNG next to
        the output file and that copy is inserted; otherwise the original
        file is inserted. Temporary files are recorded in ``temp_files``.

        Args:
            doc: DOCX document object.
            image_path: Image file path.
            output_path: Output file path, used to place temporary files.

        Raises:
            Exception: Any failure, re-raised with the image-insert-failed
                prefix and the original error attached as ``__cause__``.
        """
        try:
            img, width = ImageProcessor.process_image(image_path)

            if config.image_resize == "width":
                # abspath keeps dirname non-empty for a bare file name, which
                # os.makedirs would otherwise reject.
                temp_dir = os.path.dirname(os.path.abspath(output_path))
                os.makedirs(temp_dir, exist_ok=True)
                temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
                img.save(temp_img_path)
                self.temp_files.append(temp_img_path)
                img_path = temp_img_path
            else:
                img_path = image_path

            para = doc.add_paragraph()
            run = para.runs[0] if para.runs else para.add_run()
            run.add_picture(img_path, width=Inches(width))
            para.alignment = ImageProcessor.get_image_alignment()
        except Exception as e:
            raise Exception(f"插入图片失败: {str(e)}") from e

    # ------------------------------------------------------------------
    # Disclaimer and cleanup
    # ------------------------------------------------------------------
    def _add_disclaimer(self, doc: "DocxDocument") -> None:
        """Append a "---" separator paragraph and the disclaimer text.

        Args:
            doc: DOCX document object.
        """
        doc.add_paragraph("---")
        para = doc.add_paragraph()
        disclaimer_text = text_processor.process_text_content(DISCLAIMER_TEXT)
        run = para.add_run(disclaimer_text)
        run.font.size = Pt(10)
        para.paragraph_format.line_spacing = 1.0

    def _cleanup_temp_files(self) -> None:
        """Delete the recorded temporary files and reset the list.

        Failures are reported and skipped so one undeletable file does not
        prevent the remaining ones from being removed.
        """
        for temp_file in self.temp_files:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except Exception as e:
                print(f"清理临时文件失败 {temp_file}: {e}")
        self.temp_files.clear()
# Module-level DocxGenerator instance, used by the generate() wrapper in this module
docx_generator = DocxGenerator()
# Function kept for compatibility with the legacy interface
def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str, progress_callback: Optional[Callable] = None) -> bool:
    """Generate a DOCX document through the module-level generator instance.

    Legacy-interface wrapper: every argument is forwarded unchanged to
    ``docx_generator.generate`` and its result is returned.
    """
    return docx_generator.generate(
        sections=sections,
        image_files=image_files,
        output_path=output_path,
        progress_callback=progress_callback,
    )