TxT2Docx/docx_generator.py
2025-09-21 19:01:40 +08:00

428 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
DOCX document generation module.

Converts the parsed Markdown structure into a DOCX document, including text
formatting, image insertion and style setup.
"""
import os
import re
from typing import List, Dict, Any, Callable, Optional
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from config import config
from text_processor import text_processor
from image_processor import ImageProcessor
from markdown_parser import MarkdownParser
# Disclaimer text appended to the end of the document when
# config.add_disclaimer is truthy. The literal is wrapped in backticks.
DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`"""
class DocxGenerator:
    """Build a DOCX document from parsed Markdown sections.

    Handles headings, inline formatting, lists, block quotes, code blocks,
    tables, horizontal rules, per-section image insertion and the optional
    trailing disclaimer.
    """

    # Inline-markup patterns, compiled once; used to strip the Markdown
    # delimiters from a formatted span before it is written as a run.
    _BOLD_RE = re.compile(r'\*\*(.+?)\*\*|__(.+?)__')
    _ITALIC_RE = re.compile(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|_(.+?)_')
    _CODE_RE = re.compile(r'`([^`]+)`')
    _STRIKE_RE = re.compile(r'~~(.+?)~~')

    # Element types rendered as a single paragraph with a built-in style.
    _STYLED_ELEMENTS = {
        'unordered_list': 'List Bullet',
        'ordered_list': 'List Number',
        'blockquote': 'Quote',
    }

    def __init__(self):
        """Initialise the generator with an empty temporary-file registry."""
        # Temporary images written by _insert_image; removed by
        # _cleanup_temp_files at the end of every generate() call.
        self.temp_files = []

    def generate(self, sections: List[Dict[str, Any]], image_files: List[str],
                 output_path: str, progress_callback: Optional[Callable] = None) -> bool:
        """Render ``sections`` into a DOCX file saved at ``output_path``.

        Args:
            sections: Parsed section dicts. Each is read for ``'content'``,
                ``'level'`` and an optional ``'elements'`` list.
            image_files: Image file paths, consumed in order (at most one
                image per section).
            output_path: Destination path of the DOCX file.
            progress_callback: Optional callable invoked as
                ``progress_callback(percent, message)``.

        Returns:
            bool: ``True`` once the document has been saved.

        Raises:
            Exception: Wraps any failure; the original error is chained as
                ``__cause__``.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            total_sections = len(sections)
            image_count = len(image_files)
            image_index = 0

            for i, section in enumerate(sections):
                if progress_callback:
                    progress = int((i / total_sections) * 100)
                    title = section['content']
                    # Long titles are shortened to 30 characters for the status message.
                    section_title = title[:30] + "..." if len(title) > 30 else title
                    progress_callback(progress, f"处理章节: {section_title}")

                image_index = self._add_section_to_doc(
                    doc, section, image_files, image_index, image_count, output_path)

            if config.add_disclaimer:
                self._add_disclaimer(doc)

            doc.save(output_path)

            if progress_callback:
                progress_callback(100, "转换完成!")
            return True
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"生成DOCX失败: {str(e)}") from e
        finally:
            # Temporary images are only needed until the document is saved.
            self._cleanup_temp_files()

    def _setup_document_styles(self, doc: Document) -> None:
        """Apply the configured line spacing to the document's 'Normal' style.

        Failures are reported with ``print`` and otherwise ignored.

        Args:
            doc: The DOCX document object.
        """
        try:
            styles = doc.styles
            if 'Normal' in styles:
                normal_style = styles['Normal']
                if config.line_spacing > 0:
                    normal_style.paragraph_format.line_spacing = config.line_spacing
        except Exception as e:
            print(f"设置文档样式时出错: {e}")

    def _add_section_to_doc(self, doc: Document, section: Dict[str, Any],
                            image_files: List[str], image_index: int, image_count: int,
                            output_path: str) -> int:
        """Append one section (title, elements and at most one image) to ``doc``.

        A section whose level lies in ``1..config.title_levels`` gets a real
        heading. Any other section gets a bold 14 pt paragraph as its title,
        unless its content is the placeholder ``'前置内容'``, in which case no
        title is written.

        Args:
            doc: The DOCX document object.
            section: Section dict (``'level'``, ``'content'``, optional ``'elements'``).
            image_files: Image file paths.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path (used to place temporary files).

        Returns:
            int: The updated image index.
        """
        if section['level'] > 0 and section['level'] <= config.title_levels:
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_heading(level=section['level'])
            self._apply_inline_formatting(para, heading_text)
        elif section['content'] != '前置内容':
            heading_text = text_processor.process_text_content(section['content'])
            para = doc.add_paragraph()
            run = para.add_run(heading_text)
            run.font.size = Pt(14)
            run.font.bold = True
            # Spacing lives on paragraph_format; assigning para.space_after
            # directly only creates a stray attribute and has no effect.
            para.paragraph_format.space_after = Pt(12)

        elements = section.get('elements', [])
        if not elements:
            return image_index

        # The section image goes right after the first non-empty element.
        first_content_added = False
        for element in elements:
            self._add_element_to_doc(doc, element)
            if not first_content_added and element['type'] not in ['empty']:
                first_content_added = True
                image_index = self._insert_section_image(
                    doc, image_files, image_index, image_count, output_path)

        return image_index

    def _add_element_to_doc(self, doc: Document, element: Dict[str, Any]) -> None:
        """Append a single parsed element to ``doc`` according to its ``'type'``.

        Unrecognised element types are ignored.

        Args:
            doc: The DOCX document object.
            element: Element dict with a ``'type'`` key and type-specific data
                (``'content'``, ``'language'`` or ``'rows'``).
        """
        element_type = element['type']
        content = text_processor.process_text_content(element.get('content', ''))

        if element_type == 'paragraph':
            self._add_formatted_paragraph(doc, content)
        elif element_type in self._STYLED_ELEMENTS:
            para = doc.add_paragraph(style=self._STYLED_ELEMENTS[element_type])
            self._apply_inline_formatting(para, content)
        elif element_type == 'code_block':
            # Code is written verbatim: the raw content, not the processed text.
            self._add_code_block(doc, element.get('content', ''), element.get('language', ''))
        elif element_type == 'table':
            self._add_table_to_doc(doc, element.get('rows', []))
        elif element_type == 'horizontal_rule':
            self._add_horizontal_rule(doc)
        elif element_type == 'empty':
            doc.add_paragraph()

    def _add_formatted_paragraph(self, doc: Document, content: str) -> None:
        """Append a paragraph with inline formatting and configured line spacing.

        Empty or whitespace-only content produces an empty paragraph.

        Args:
            doc: The DOCX document object.
            content: Paragraph text (may contain inline Markdown markup).
        """
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        self._apply_inline_formatting(para, content)
        if config.line_spacing > 0:
            para.paragraph_format.line_spacing = config.line_spacing

    def _apply_inline_formatting(self, paragraph, text: str) -> None:
        """Write ``text`` into ``paragraph`` as runs, honouring inline markup.

        Spans reported by ``MarkdownParser.extract_inline_formatting`` are
        written as bold / italic / code / strikethrough / link runs; the text
        between spans is written as plain runs.

        Args:
            paragraph: The DOCX paragraph to append runs to.
            text: The text to format (already passed through the text processor
                by the caller).
        """
        # NOTE(review): assumes the spans are sorted by 'start' and do not
        # overlap - confirm against MarkdownParser.extract_inline_formatting.
        formatting = MarkdownParser.extract_inline_formatting(text)

        if not formatting:
            paragraph.add_run(text)
            return

        current_pos = 0
        for fmt in formatting:
            # Plain text preceding this formatted span.
            if fmt['start'] > current_pos:
                paragraph.add_run(text[current_pos:fmt['start']])

            segment = text[fmt['start']:fmt['end']]
            fmt_type = fmt['type']

            if fmt_type == 'bold':
                run = paragraph.add_run(self._BOLD_RE.sub(r'\1\2', segment))
                run.bold = True
            elif fmt_type == 'italic':
                run = paragraph.add_run(self._ITALIC_RE.sub(r'\1\2', segment))
                run.italic = True
            elif fmt_type == 'code':
                run = paragraph.add_run(self._CODE_RE.sub(r'\1', segment))
                run.font.name = 'Courier New'
                run.font.size = Pt(10)
            elif fmt_type == 'strikethrough':
                run = paragraph.add_run(self._STRIKE_RE.sub(r'\1', segment))
                run.font.strike = True
            elif fmt_type == 'link':
                # Only the link text is shown, styled blue and underlined.
                run = paragraph.add_run(fmt['text'])
                run.font.color.rgb = RGBColor(0, 0, 255)
                run.underline = True

            current_pos = fmt['end']

        # Plain text after the last formatted span.
        if current_pos < len(text):
            paragraph.add_run(text[current_pos:])

    def _add_code_block(self, doc: Document, content: str, language: str) -> None:
        """Append a code block as a monospaced 'No Spacing' paragraph.

        Args:
            doc: The DOCX document object.
            content: The code text, written verbatim.
            language: Programming language of the block (currently unused).
        """
        para = doc.add_paragraph(style='No Spacing')
        run = para.add_run(content)
        run.font.name = 'Courier New'
        run.font.size = Pt(10)

        # Vertical padding around the block is cosmetic, so it stays
        # best-effort; only ordinary errors are ignored (not KeyboardInterrupt
        # or SystemExit, which a bare except would also swallow).
        try:
            para.paragraph_format.space_before = Pt(6)
            para.paragraph_format.space_after = Pt(6)
        except Exception:
            pass

    def _add_table_to_doc(self, doc: Document, rows: List[List[str]]) -> None:
        """Append a 'Table Grid' table filled from ``rows``.

        The column count is taken from the first row; cells beyond that count
        in later rows are dropped.

        Args:
            doc: The DOCX document object.
            rows: Table data, one list of cell strings per row.
        """
        if not rows:
            return

        table = doc.add_table(rows=len(rows), cols=len(rows[0]))
        table.style = 'Table Grid'

        for i, row_data in enumerate(rows):
            row_cells = table.rows[i].cells
            for j, cell_data in enumerate(row_data):
                if j < len(row_cells):
                    row_cells[j].text = text_processor.process_text_content(cell_data)

    def _add_horizontal_rule(self, doc: Document) -> None:
        """Append a horizontal rule, emulated by 100 underlined spaces, centred.

        Args:
            doc: The DOCX document object.
        """
        para = doc.add_paragraph()
        run = para.add_run()
        run.font.underline = True
        run.text = " " * 100  # long enough underline to read as a rule
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    @staticmethod
    def _next_image_index(image_index: int, image_count: int, strategy: str) -> int:
        """Return the index of the next image once one has been consumed.

        Args:
            image_index: Index already advanced past the image just inserted.
            image_count: Total number of images.
            strategy: Value of ``config.image_strategy``.

        Returns:
            int: ``image_index`` unchanged while images remain. Once exhausted:
            ``0`` for ``"cycle"``, the last index for ``"repeat_last"``, and
            ``image_count`` (no further images) for ``"truncate"`` or any
            other value.
        """
        if image_index < image_count:
            return image_index
        if strategy == "cycle":
            return 0
        if strategy == "repeat_last":
            # Stay on the last image so it is reused for later sections.
            return image_count - 1
        return image_count

    def _insert_section_image(self, doc: Document, image_files: List[str],
                              image_index: int, image_count: int, output_path: str) -> int:
        """Insert the next image for a section, if one is available.

        On failure a red error paragraph is written instead and the index is
        left unchanged.

        Args:
            doc: The DOCX document object.
            image_files: Image file paths.
            image_index: Index of the next image to insert.
            image_count: Total number of images.
            output_path: Output file path (used to place temporary files).

        Returns:
            int: The updated image index (see ``_next_image_index``).
        """
        if image_count > 0 and image_index < image_count:
            try:
                self._insert_image(doc, image_files[image_index], output_path)
                image_index = self._next_image_index(
                    image_index + 1, image_count, config.image_strategy)
            except Exception as e:
                para = doc.add_paragraph()
                run = para.add_run(f"[图片插入失败: {str(e)}]")
                run.font.color.rgb = RGBColor(255, 0, 0)  # red
        return image_index

    def _insert_image(self, doc: Document, image_path: str, output_path: str) -> None:
        """Insert one image into ``doc`` as its own paragraph.

        When ``config.image_resize == "width"`` the processed image is saved
        as a temporary PNG next to the output file and that copy is embedded;
        otherwise the original file is embedded.

        Args:
            doc: The DOCX document object.
            image_path: Path of the image file.
            output_path: Output file path; its directory hosts temporary images.

        Raises:
            Exception: Wraps any failure; the original error is chained as
                ``__cause__``.
        """
        try:
            # process_image returns the image object and the display width,
            # which is passed to Inches() below.
            img, width = ImageProcessor.process_image(image_path)

            if config.image_resize == "width":
                # A bare filename has an empty dirname, and os.makedirs('')
                # raises FileNotFoundError - fall back to the current directory.
                temp_dir = os.path.dirname(output_path) or os.curdir
                os.makedirs(temp_dir, exist_ok=True)
                img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
                img.save(img_path)
                self.temp_files.append(img_path)
            else:
                img_path = image_path

            para = doc.add_paragraph()
            run = para.runs[0] if para.runs else para.add_run()
            run.add_picture(img_path, width=Inches(width))
            para.alignment = ImageProcessor.get_image_alignment()
        except Exception as e:
            raise Exception(f"插入图片失败: {str(e)}") from e

    def _add_disclaimer(self, doc: Document) -> None:
        """Append a ``---`` separator paragraph followed by the disclaimer.

        The disclaimer is written as a single 10 pt run with single line
        spacing.

        Args:
            doc: The DOCX document object.
        """
        doc.add_paragraph("---")
        para = doc.add_paragraph()
        disclaimer_text = text_processor.process_text_content(DISCLAIMER_TEXT)
        run = para.add_run(disclaimer_text)
        run.font.size = Pt(10)
        para.paragraph_format.line_spacing = 1.0

    def _cleanup_temp_files(self) -> None:
        """Delete every registered temporary file and reset the registry.

        Deletion errors are reported with ``print`` and do not stop the
        remaining files from being processed.
        """
        for temp_file in self.temp_files:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except Exception as e:
                print(f"清理临时文件失败 {temp_file}: {e}")
        self.temp_files.clear()
# Module-level DocxGenerator instance, used by the generate() wrapper below.
docx_generator = DocxGenerator()
# Function kept for compatibility with the old module-level interface.
def generate(sections: List[Dict[str, Any]], image_files: List[str],
             output_path: str, progress_callback: Optional[Callable] = None) -> bool:
    """Generate a DOCX document (legacy interface).

    Forwards all arguments unchanged to the ``generate`` method of the
    module-level ``docx_generator`` instance and returns its result.
    """
    succeeded = docx_generator.generate(
        sections,
        image_files,
        output_path,
        progress_callback,
    )
    return succeeded