TxT2Docx/docx_generator.py
wsb1224 6b5e4adea6 更新图片插入位置:
有标题时选择标题前后插入
无标题是可自定义段落插入
2025-10-15 17:10:50 +08:00

654 lines
28 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
DOCX文档生成模块
负责将解析后的Markdown结构转换为DOCX文档,包括文本格式化、图片插入和样式设置。
"""
import os
import re
from typing import List, Dict, Any, Callable, Optional
from docx import Document
from docx.document import Document as DocxDocument
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from config import config
from text_processor import text_processor
from image_processor import ImageProcessor
from markdown_parser import MarkdownParser
from style_manager import style_manager
# Disclaimer text; DocxGenerator._add_disclaimer writes it at the end of the document
# when config.add_disclaimer is set.
# NOTE(review): the text is wrapped in backticks and is written through a plain run,
# so whether the backticks show up in the output depends on text_processor - confirm.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`"""
class DocxGenerator:
"""DOCX文档生成器类"""
def __init__(self):
"""初始化DOCX生成器"""
self.temp_files = [] # 跟踪临时文件以便清理
self.current_document_style = None # 当前使用的文档样式
self.paragraph_count = 0 # 段落计数器,用于无标题文章的图片插入控制
def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
output_path: str, progress_callback: Optional[Callable] = None) -> bool:
"""
生成DOCX文档
Args:
sections: 解析后的文档章节列表
image_files: 图片文件路径列表
output_path: 输出文件路径
progress_callback: 进度回调函数
Returns:
bool: 是否生成成功
Raises:
Exception: 生成失败时
"""
try:
doc = Document()
self._setup_document_styles(doc)
total_sections = len(sections)
image_index = 0
image_count = len(image_files)
for i, section in enumerate(sections):
if progress_callback:
progress = int((i / total_sections) * 100)
section_title = section['content'][:30] + "..." if len(section['content']) > 30 else section['content']
progress_callback(progress, f"处理章节: {section_title}")
# 添加章节内容
image_index = self._add_section_to_doc(doc, section, image_files, image_index, image_count, output_path)
# 添加免责声明
if config.add_disclaimer:
self._add_disclaimer(doc)
# 保存文档
doc.save(output_path)
if progress_callback:
progress_callback(100, "转换完成!")
return True
except Exception as e:
raise Exception(f"生成DOCX失败: {str(e)}")
finally:
# 清理临时文件
self._cleanup_temp_files()
def _setup_document_styles(self, doc) -> None:
"""
设置文档样式
Args:
doc: DOCX文档对象
"""
try:
# 获取当前选中的样式
current_style = style_manager.get_style(config.current_style)
if not current_style:
print(f"警告: 找不到样式 '{config.current_style}',使用默认样式")
return
self.current_document_style = current_style
print(f"应用文档样式: {current_style.name}")
except Exception as e:
print(f"设置文档样式时出错: {e}")
def _add_section_to_doc(self, doc: DocxDocument, section: Dict[str, Any],
image_files: List[str], image_index: int, image_count: int,
output_path: str) -> int:
"""
Add one section (its title and its elements) to the document.

NOTE(review): indentation is not visible in this view of the file, so the nesting
of some branches below cannot be confirmed here; review notes mark those spots.

Args:
doc: DOCX document object
section: Section data; reads 'level', 'content' and the optional 'elements' list
image_files: List of image file paths
image_index: Index of the next image to insert
image_count: Total number of images
output_path: Output file path (passed on to the image insertion helper)
Returns:
int: The updated image index
"""
# Add the section title as a heading when its level is within 1..config.title_levels
if section['level'] > 0 and section['level'] <= config.title_levels:
heading_text = text_processor.process_text_content(section['content'])
para = doc.add_heading(level=section['level'])
# Clear the default content so a custom-styled run can be added
para.clear()
run = para.add_run(heading_text)
# Apply the heading style configured for this level, if there is one
if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.heading_styles:
if section['level'] in self.current_document_style.heading_styles:
heading_style = self.current_document_style.heading_styles[section['level']]
if heading_style.font:
run.font.name = heading_style.font.name
run.font.size = Pt(heading_style.font.size)
run.font.bold = heading_style.font.bold
run.font.italic = heading_style.font.italic
# A colour is only written when it differs from "#000000"
if heading_style.font.color != "#000000":
run.font.color.rgb = RGBColor.from_string(heading_style.font.color.replace('#', ''))
if heading_style.paragraph:
para_style = heading_style.paragraph
# Spacing/indent values are only written when they are positive
if para_style.line_spacing > 0:
para.paragraph_format.line_spacing = para_style.line_spacing
if para_style.space_before > 0:
para.paragraph_format.space_before = Pt(para_style.space_before)
if para_style.space_after > 0:
para.paragraph_format.space_after = Pt(para_style.space_after)
if para_style.first_line_indent > 0:
# The value is multiplied by 12 before conversion to points
# (presumably characters at 12pt each - confirm)
para.paragraph_format.first_line_indent = Pt(para_style.first_line_indent * 12)
# Set the alignment; anything other than center/right/justify becomes left
if para_style.alignment == "center":
para.alignment = WD_ALIGN_PARAGRAPH.CENTER
elif para_style.alignment == "right":
para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
elif para_style.alignment == "justify":
para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
else:
para.alignment = WD_ALIGN_PARAGRAPH.LEFT
else:
# Default style: 18pt minus 2pt per level for levels up to 6, otherwise 10pt, bold
run.font.size = Pt(18 - section['level'] * 2 if section['level'] <= 6 else 10)
run.font.bold = True
# NOTE(review): heading_text was already added as a run above; this call adds runs
# for the same text again, so the heading text may end up duplicated - confirm.
self._apply_inline_formatting(para, heading_text)
# With a heading present, config decides whether the image goes before or after it
if image_count > 0 and image_index < image_count:
# Check whether the image should be inserted before the title
# NOTE(review): add_heading above has already appended the heading, so this image
# is appended after it even for "before_title" - confirm the intended position.
if hasattr(config, 'image_insert_position') and config.image_insert_position == "before_title":
image_index = self._insert_section_image(doc, image_files, image_index, image_count, output_path)
# Otherwise (level not in 1..config.title_levels) the content is written as a plain
# paragraph, unless it equals '前置内容' (presumably a parser placeholder - confirm)
elif section['content'] != '前置内容':
heading_text = text_processor.process_text_content(section['content'])
para = doc.add_paragraph()
run = para.add_run(heading_text)
# Apply style settings (font only in this branch)
if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.heading_styles:
if section['level'] in self.current_document_style.heading_styles:
heading_style = self.current_document_style.heading_styles[section['level']]
if heading_style.font:
run.font.name = heading_style.font.name
run.font.size = Pt(heading_style.font.size)
run.font.bold = heading_style.font.bold
run.font.italic = heading_style.font.italic
if heading_style.font.color != "#000000":
run.font.color.rgb = RGBColor.from_string(heading_style.font.color.replace('#', ''))
else:
run.font.size = Pt(14)
run.font.bold = True
para.paragraph_format.space_after = Pt(12)
# Process the elements of this section; nothing more to do when there are none
elements = section.get('elements', [])
if not elements:
return image_index
# Walk the elements
for element in elements:
# Add the element to the document
self._add_element_to_doc(doc, element)
# Image placement depends on the article structure; 'empty' elements never trigger it
if element['type'] not in ['empty']:
# Section with a heading: config decides whether an image follows the title
if section['level'] > 0 and section['level'] <= config.title_levels:
# Titled article: insert the image after the first content element following the title
if hasattr(config, 'image_insert_position') and config.image_insert_position == "after_title":
image_index = self._insert_section_image(doc, image_files, image_index, image_count, output_path)
# Insert only once, then no more until the next title
# NOTE(review): this break leaves the loop over elements, so the remaining elements
# of the section are never passed to _add_element_to_doc - verify that section
# content is not being dropped here.
break
else:
# Untitled article: the paragraph counter controls the spacing between images
self.paragraph_count += 1
if image_count > 0 and image_index < image_count:
# Check whether an image is due (based on the configured interval)
# NOTE(review): a configured interval of 0 would raise ZeroDivisionError in the modulo below.
image_insert_interval = getattr(config, 'image_insert_interval', 5) # default: one image every 5 paragraphs
if self.paragraph_count % image_insert_interval == 0:
image_index = self._insert_section_image(doc, image_files, image_index, image_count, output_path)
return image_index
def _add_element_to_doc(self, doc: DocxDocument, element: Dict[str, Any]) -> None:
    """
    Append one parsed element to the document.

    Args:
        doc: DOCX document object.
        element: Element data; 'type' selects the handler, and 'content', 'language'
            or 'rows' are read depending on that type. Unknown types are ignored.
    """
    kind = element['type']
    # The text is processed up front for every element type; code blocks and
    # tables read their raw fields from the element instead.
    text = text_processor.process_text_content(element.get('content', ''))

    if kind == 'paragraph':
        self._add_formatted_paragraph(doc, text)
        return

    if kind in ('unordered_list', 'ordered_list'):
        is_ordered = kind == 'ordered_list'
        item = doc.add_paragraph(style='List Number' if is_ordered else 'List Bullet')
        self._apply_inline_formatting(item, text)
        # Apply the list style on top of the built-in list paragraph style.
        self._apply_list_style(item, 'ordered' if is_ordered else 'unordered')
        return

    if kind == 'blockquote':
        quote = doc.add_paragraph(style='Quote')
        self._apply_inline_formatting(quote, text)
        self._apply_quote_style(quote)
        return

    if kind == 'code_block':
        self._add_code_block(doc, element.get('content', ''), element.get('language', ''))
        return

    if kind == 'table':
        self._add_table_to_doc(doc, element.get('rows', []))
        return

    if kind == 'horizontal_rule':
        self._add_horizontal_rule(doc)
        return

    if kind == 'empty':
        doc.add_paragraph()
def _apply_list_style(self, paragraph, list_type: str) -> None:
"""
应用列表样式到段落
Args:
paragraph: DOCX段落对象
list_type: 列表类型 ('unordered''ordered')
"""
if not (hasattr(self, 'current_document_style') and self.current_document_style):
return
list_style = None
if list_type == 'unordered' and self.current_document_style.unordered_list:
list_style = self.current_document_style.unordered_list
elif list_type == 'ordered' and self.current_document_style.ordered_list:
list_style = self.current_document_style.ordered_list
if list_style and list_style.paragraph:
if list_style.paragraph.space_before > 0:
paragraph.paragraph_format.space_before = Pt(list_style.paragraph.space_before)
if list_style.paragraph.space_after > 0:
paragraph.paragraph_format.space_after = Pt(list_style.paragraph.space_after)
def _apply_quote_style(self, paragraph) -> None:
"""
应用引用块样式到段落
Args:
paragraph: DOCX段落对象
"""
if not (hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.quote_block):
return
quote_style = self.current_document_style.quote_block
if quote_style.paragraph:
if quote_style.paragraph.line_spacing > 0:
paragraph.paragraph_format.line_spacing = quote_style.paragraph.line_spacing
if quote_style.paragraph.space_before > 0:
paragraph.paragraph_format.space_before = Pt(quote_style.paragraph.space_before)
if quote_style.paragraph.space_after > 0:
paragraph.paragraph_format.space_after = Pt(quote_style.paragraph.space_after)
if quote_style.paragraph.first_line_indent > 0:
paragraph.paragraph_format.first_line_indent = Pt(quote_style.paragraph.first_line_indent * 12)
# 设置对齐方式
if quote_style.paragraph.alignment == "center":
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
elif quote_style.paragraph.alignment == "right":
paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
elif quote_style.paragraph.alignment == "justify":
paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
def _add_formatted_paragraph(self, doc: DocxDocument, content: str) -> None:
    """
    Add a body paragraph with inline formatting and paragraph-level styling.

    Blank content produces an empty paragraph. Otherwise the paragraph format comes
    from the current document style's ``body_paragraph``; when no such format is
    available (no style at all, or a style without ``body_paragraph``), the global
    ``config.line_spacing`` is used as the fallback.

    Args:
        doc: DOCX document object.
        content: Paragraph text (already processed by the caller).
    """
    if not content or not content.strip():
        doc.add_paragraph()
        return

    para = doc.add_paragraph()
    self._apply_inline_formatting(para, content)

    doc_style = getattr(self, 'current_document_style', None)
    body_para = doc_style.body_paragraph if doc_style else None

    if not body_para:
        # Fallback for every "no body paragraph style" case, so line spacing is not
        # silently skipped when a style object exists but lacks body_paragraph.
        if config.line_spacing > 0:
            para.paragraph_format.line_spacing = config.line_spacing
        return

    # Only positive values are written to the paragraph.
    if body_para.line_spacing > 0:
        para.paragraph_format.line_spacing = body_para.line_spacing
    if body_para.space_before > 0:
        para.paragraph_format.space_before = Pt(body_para.space_before)
    if body_para.space_after > 0:
        para.paragraph_format.space_after = Pt(body_para.space_after)
    if body_para.first_line_indent > 0:
        # Characters to points (12pt per character).
        para.paragraph_format.first_line_indent = Pt(body_para.first_line_indent * 12)

    # Set the alignment; anything other than center/right/justify becomes left.
    if body_para.alignment == "center":
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    elif body_para.alignment == "right":
        para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    elif body_para.alignment == "justify":
        para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    else:
        para.alignment = WD_ALIGN_PARAGRAPH.LEFT
def _apply_inline_formatting(self, paragraph, text: str) -> None:
    """
    Append ``text`` to ``paragraph`` as runs, rendering inline Markdown spans.

    Spans come from ``MarkdownParser.extract_inline_formatting``; each span is read
    for 'start', 'end' and 'type' (link spans also for 'text'). Text between spans
    is added with the body font. Spans of an unrecognised type are emitted as plain
    body text instead of being dropped.

    Args:
        paragraph: DOCX paragraph object that receives the runs.
        text: Text to format (callers pass already-processed text).
    """
    spans = MarkdownParser.extract_inline_formatting(text)

    # No inline formatting: add the text as a single body run.
    if not spans:
        self._apply_body_font_style(paragraph.add_run(text))
        return

    # Marker-stripping patterns per span type, defined once outside the loop.
    marker_patterns = {
        'bold': (r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2'),
        'italic': (r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|_(.+?)_', r'\1\2'),
        'code': (r'`([^`]+)`', r'\1'),
        'strikethrough': (r'~~(.+?)~~', r'\1'),
    }

    cursor = 0
    for span in spans:
        start, end, kind = span['start'], span['end'], span['type']

        # Plain text between the previous span and this one.
        if start > cursor:
            self._apply_body_font_style(paragraph.add_run(text[cursor:start]))

        raw = text[start:end]
        if kind == 'link':
            # For links only the link text is shown.
            run = paragraph.add_run(span['text'])
        elif kind in marker_patterns:
            pattern, replacement = marker_patterns[kind]
            run = paragraph.add_run(re.sub(pattern, replacement, raw))
        else:
            # Unknown span type: keep the text rather than silently losing it.
            run = paragraph.add_run(raw)

        if kind == 'code':
            # Inline code prefers the code font of the document style and falls
            # back to a monospace default whenever the style provides no font.
            doc_style = getattr(self, 'current_document_style', None)
            code_style = doc_style.code_block if doc_style else None
            code_font = code_style.font if code_style else None
            if code_font:
                run.font.name = code_font.name
                run.font.size = Pt(code_font.size)
                if code_font.color != "#000000":
                    run.font.color.rgb = RGBColor.from_string(code_font.color.replace('#', ''))
            else:
                run.font.name = 'Courier New'
                run.font.size = Pt(10)
        else:
            # Body font first, then the span-specific override on top of it.
            self._apply_body_font_style(run)
            if kind == 'bold':
                run.bold = True
            elif kind == 'italic':
                run.italic = True
            elif kind == 'strikethrough':
                run.font.strike = True
            elif kind == 'link':
                run.font.color.rgb = RGBColor(0, 0, 255)  # blue
                run.underline = True

        cursor = end

    # Remaining plain text after the last span.
    if cursor < len(text):
        self._apply_body_font_style(paragraph.add_run(text[cursor:]))
def _apply_body_font_style(self, run) -> None:
"""
应用正文字体样式到run
Args:
run: DOCX run对象
"""
if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.body_font:
body_font = self.current_document_style.body_font
run.font.name = body_font.name
run.font.size = Pt(body_font.size)
run.font.bold = body_font.bold
run.font.italic = body_font.italic
if body_font.color != "#000000":
run.font.color.rgb = RGBColor.from_string(body_font.color.replace('#', ''))
def _add_code_block(self, doc: DocxDocument, content: str, language: str) -> None:
    """
    Add a code block as a single 'No Spacing' paragraph.

    Args:
        doc: DOCX document object.
        content: Code text, written unprocessed into one run.
        language: Programming language (accepted but not used by this method).
    """
    block = doc.add_paragraph(style='No Spacing')
    code_run = block.add_run(content)

    doc_style = getattr(self, 'current_document_style', None)
    code_style = doc_style.code_block if doc_style else None

    if not code_style:
        # Built-in default: small monospace text with 6pt above and below.
        code_run.font.name = 'Courier New'
        code_run.font.size = Pt(10)
        block.paragraph_format.space_before = Pt(6)
        block.paragraph_format.space_after = Pt(6)
        return

    font_spec = code_style.font
    if font_spec:
        code_run.font.name = font_spec.name
        code_run.font.size = Pt(font_spec.size)
        code_run.font.bold = font_spec.bold
        code_run.font.italic = font_spec.italic
        if font_spec.color != "#000000":
            code_run.font.color.rgb = RGBColor.from_string(font_spec.color.replace('#', ''))

    spacing = code_style.paragraph
    if spacing:
        # Only positive spacing values are written to the paragraph.
        if spacing.space_before > 0:
            block.paragraph_format.space_before = Pt(spacing.space_before)
        if spacing.space_after > 0:
            block.paragraph_format.space_after = Pt(spacing.space_after)
def _add_table_to_doc(self, doc: DocxDocument, rows: List[List[str]]) -> None:
"""
添加表格到文档
Args:
doc: DOCX文档对象
rows: 表格行数据
"""
if not rows:
return
table = doc.add_table(rows=len(rows), cols=len(rows[0]))
table.style = 'Table Grid'
for i, row_data in enumerate(rows):
row_cells = table.rows[i].cells
for j, cell_data in enumerate(row_data):
if j < len(row_cells):
processed_text = text_processor.process_text_content(cell_data)
cell_para = row_cells[j].paragraphs[0]
cell_para.clear()
run = cell_para.add_run(processed_text)
# 应用表格样式
if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.table_style:
table_style = self.current_document_style.table_style
if table_style.font:
run.font.name = table_style.font.name
run.font.size = Pt(table_style.font.size)
run.font.bold = table_style.font.bold
run.font.italic = table_style.font.italic
if table_style.font.color != "#000000":
run.font.color.rgb = RGBColor.from_string(table_style.font.color.replace('#', ''))
if table_style.paragraph:
para_style = table_style.paragraph
if para_style.space_before > 0:
cell_para.paragraph_format.space_before = Pt(para_style.space_before)
if para_style.space_after > 0:
cell_para.paragraph_format.space_after = Pt(para_style.space_after)
def _add_horizontal_rule(self, doc: DocxDocument) -> None:
    """
    Add a horizontal rule, drawn as a centered run of underlined spaces.

    Args:
        doc: DOCX document object.
    """
    rule = doc.add_paragraph()
    line = rule.add_run()
    line.font.underline = True
    # 100 underlined spaces are long enough to read as a line.
    line.text = " " * 100
    rule.alignment = WD_ALIGN_PARAGRAPH.CENTER
def _insert_section_image(self, doc: DocxDocument, image_files: List[str],
image_index: int, image_count: int, output_path: str) -> int:
"""
为章节插入图片
Args:
doc: DOCX文档对象
image_files: 图片文件列表
image_index: 当前图片索引
image_count: 图片总数
output_path: 输出文件路径
Returns:
int: 更新后的图片索引
"""
if image_count > 0 and image_index < image_count:
try:
self._insert_image(doc, image_files[image_index], output_path)
image_index += 1
# 根据策略处理图片不足的情况
if image_index >= image_count:
if config.image_strategy == "cycle":
image_index = 0
elif config.image_strategy == "truncate":
image_index = image_count
# repeat_last策略保持当前索引-1下次还用最后一张
except Exception as e:
# 插入失败时添加错误提示
para = doc.add_paragraph()
run = para.add_run(f"[图片插入失败: {str(e)}]")
run.font.color.rgb = RGBColor(255, 0, 0) # 红色
return image_index
def _insert_image(self, doc: DocxDocument, image_path: str, output_path: str) -> None:
"""
插入图片到文档
Args:
doc: DOCX文档对象
image_path: 图片文件路径
output_path: 输出文件路径(用于临时文件)
"""
try:
# 使用优化方法处理图片
temp_dir = os.path.join(os.path.dirname(output_path), "temp_images")
os.makedirs(temp_dir, exist_ok=True)
optimized_image_path = ImageProcessor.optimize_image_for_docx(image_path, temp_dir)
# 处理图片(方向修正和尺寸调整)
img, width = ImageProcessor.process_image(optimized_image_path)
temp_img_path = None
if config.image_resize == "width":
# 需要保存临时图片
temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
img.save(temp_img_path)
self.temp_files.append(temp_img_path)
img_path = temp_img_path
else:
img_path = optimized_image_path if optimized_image_path != image_path else image_path
# 创建段落并插入图片
para = doc.add_paragraph()
run = para.runs[0] if para.runs else para.add_run()
run.add_picture(img_path, width=Inches(width))
para.alignment = ImageProcessor.get_image_alignment()
except Exception as e:
raise Exception(f"插入图片失败: {str(e)}")
def _add_disclaimer(self, doc: DocxDocument) -> None:
    """
    Append a "---" separator paragraph followed by the disclaimer text.

    Args:
        doc: DOCX document object.
    """
    doc.add_paragraph("---")
    notice = doc.add_paragraph()
    notice_run = notice.add_run(text_processor.process_text_content(DISCLAIMER_TEXT))
    # The disclaimer is set in 10pt with single line spacing.
    notice_run.font.size = Pt(10)
    notice.paragraph_format.line_spacing = 1.0
def _cleanup_temp_files(self) -> None:
"""清理临时文件"""
for temp_file in self.temp_files:
try:
if os.path.exists(temp_file):
os.remove(temp_file)
except Exception as e:
print(f"清理临时文件失败 {temp_file}: {e}")
self.temp_files.clear()
# Module-level DOCX generator instance, created at import time and used by the
# module-level generate() function.
docx_generator = DocxGenerator()
# Function kept for compatibility with the old interface
def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str, progress_callback: Optional[Callable] = None) -> bool:
    """Generate a DOCX document through the shared generator (legacy-compatible entry point)."""
    return docx_generator.generate(
        sections=sections,
        image_files=image_files,
        output_path=output_path,
        progress_callback=progress_callback,
    )