import os
import sys
import glob
import re
import random
import json
from typing import Tuple, List

from PIL import Image
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
import PySimpleGUI as sg
from replacestr import replace_text
import configparser

CONFIG_FILE_PATH = os.path.join(os.path.expanduser("~"), ".txt2md2docx.ini")


# Typo ("wrong character") injection feature
def load_error_chars(db_path: str = "data/error_chars.json") -> dict:
    """Load the wrong-character (typo) database."""
    # Create the containing folder if it does not exist yet.
    dir_name = os.path.dirname(db_path)
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)

    # If the database file does not exist, create a default one.
    if not os.path.exists(db_path):
        default_chars = {
            "的": "地", "地": "得", "得": "的", "在": "再", "再": "在",
            "是": "事", "事": "是", "他": "她", "她": "他", "你": "您",
            "您": "你", "们": "门", "门": "们", "有": "又", "又": "有",
            "和": "合", "合": "和", "到": "倒", "倒": "到", "就": "才",
            "才": "就", "要": "耍", "耍": "要", "会": "汇", "汇": "会",
            "看": "着", "着": "看", "说": "讲", "讲": "说", "做": "作",
            "作": "做", "已": "己", "己": "已", "以": "已", "已": "以",
            "进": "近", "近": "进", "象": "像", "像": "象", "对": "队",
            "队": "对", "分": "份", "份": "分",
        }

        with open(db_path, 'w', encoding='utf-8') as f:
            json.dump(default_chars, f, ensure_ascii=False, indent=2)
        return default_chars

    # Load the existing database.
    with open(db_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def introduce_char_errors(text: str, intensity: float = 1.0, db_path: str = "data/error_chars.json") -> Tuple[
        str, int, List[str], List[str]]:
    """
    Replace correct characters in the text with common wrong ones.

    Args:
        text: the text to process
        intensity: error-injection strength between 0.0 and 1.0;
            1.0 replaces every candidate character
        db_path: path to the wrong-character database file

    Returns:
        The processed text, the total number of replacements, the list of
        original sentences and the list of processed sentences.
    """
    # Load the wrong-character database.
    error_chars = load_error_chars(db_path)

    # Split the text into sentences.
    def split_into_sentences(txt: str) -> List[str]:
        separators = re.compile(r'([。!?;,.!?;])')
        parts = separators.split(txt)
        sentences = []
        for i in range(0, len(parts) - 1, 2):
            if parts[i] or parts[i + 1]:
                sentences.append(parts[i] + parts[i + 1])
        if len(parts) % 2 == 1 and parts[-1]:
            sentences.append(parts[-1])
        return sentences

    # Inject errors into a single sentence.
    def introduce_errors_to_sentence(sentence: str) -> Tuple[str, int]:
        modified = list(sentence)
        replace_count = 0
        for i, char in enumerate(modified):
            if char in error_chars and random.random() <= intensity:
                modified[i] = error_chars[char]
                replace_count += 1
        return ''.join(modified), replace_count

    # Process the whole text.
    original_sentences = split_into_sentences(text)
    modified_sentences = []
    total_replace = 0

    for sentence in original_sentences:
        modified, count = introduce_errors_to_sentence(sentence)
        modified_sentences.append(modified)
        total_replace += count

    modified_text = ''.join(modified_sentences)
    return modified_text, total_replace, original_sentences, modified_sentences


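# Minimal usage sketch (illustrative only, not part of the original tool and
# never called by it). The sample sentence and the 0.5 intensity below are
# assumptions chosen for demonstration; the function itself is defined above.
def _demo_introduce_char_errors() -> None:
    noisy, count, originals, modified = introduce_char_errors("他说的很好。", intensity=0.5)
    print(f"replaced {count} character(s): {noisy}")

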
# Configuration
class Config:
    def __init__(self):
        # File handling
        self.txt_encoding = "utf-8"
        self.match_pattern = "exact"  # exact, prefix or contains
        self.output_location = "txt_folder"  # txt_folder or custom
        # Most recently used folders
        self.last_txt_folder = ""
        self.last_images_root = ""
        self.last_output_root = ""
        # Text processing
        self.reverse_text_order = False  # toggle text-order conversion
        # Typo injection
        self.enable_char_errors = False  # whether typo injection is enabled
        self.char_error_intensity = 0.3  # injection strength, 0.0-1.0
        self.char_error_db_path = "data/error_chars.json"  # typo database path
        # Image processing
        self.image_sort_by = "name"  # name or time
        self.image_resize = "none"  # none or width
        self.image_width = 6  # inches
        self.image_alignment = "center"  # left, center, right
        self.image_strategy = "cycle"  # cycle, truncate, repeat_last
        # Document formatting
        self.line_spacing = 1.5
        self.title_levels = 6  # maximum supported heading level
        self.replace_punctuation = False  # whether to replace punctuation
        self.add_disclaimer = False  # whether to append the disclaimer

    def load_from_file(self, file_path):
        if not os.path.exists(file_path):
            return False

        config_parser = configparser.ConfigParser()
        config_parser.read(file_path, encoding='utf-8')

        # File handling
        if 'FileHandling' in config_parser:
            self.txt_encoding = config_parser.get('FileHandling', 'txt_encoding', fallback=self.txt_encoding)
            self.match_pattern = config_parser.get('FileHandling', 'match_pattern', fallback=self.match_pattern)
            self.output_location = config_parser.get('FileHandling', 'output_location', fallback=self.output_location)
            self.last_txt_folder = config_parser.get('FileHandling', 'last_txt_folder', fallback=self.last_txt_folder)
            self.last_images_root = config_parser.get('FileHandling', 'last_images_root', fallback=self.last_images_root)
            self.last_output_root = config_parser.get('FileHandling', 'last_output_root', fallback=self.last_output_root)

        # Text processing
        if 'TextProcessing' in config_parser:
            self.reverse_text_order = config_parser.getboolean('TextProcessing', 'reverse_text_order', fallback=self.reverse_text_order)
            self.replace_punctuation = config_parser.getboolean('TextProcessing', 'replace_punctuation', fallback=self.replace_punctuation)
            self.add_disclaimer = config_parser.getboolean('TextProcessing', 'add_disclaimer', fallback=self.add_disclaimer)
            # Typo injection
            self.enable_char_errors = config_parser.getboolean('TextProcessing', 'enable_char_errors', fallback=self.enable_char_errors)
            self.char_error_intensity = config_parser.getfloat('TextProcessing', 'char_error_intensity', fallback=self.char_error_intensity)
            self.char_error_db_path = config_parser.get('TextProcessing', 'char_error_db_path', fallback=self.char_error_db_path)

        # Image processing
        if 'ImageProcessing' in config_parser:
            self.image_sort_by = config_parser.get('ImageProcessing', 'image_sort_by', fallback=self.image_sort_by)
            self.image_resize = config_parser.get('ImageProcessing', 'image_resize', fallback=self.image_resize)
            self.image_width = config_parser.getfloat('ImageProcessing', 'image_width', fallback=self.image_width)
            self.image_alignment = config_parser.get('ImageProcessing', 'image_alignment', fallback=self.image_alignment)
            self.image_strategy = config_parser.get('ImageProcessing', 'image_strategy', fallback=self.image_strategy)

        # Document formatting
        if 'DocumentFormat' in config_parser:
            self.line_spacing = config_parser.getfloat('DocumentFormat', 'line_spacing', fallback=self.line_spacing)
            self.title_levels = config_parser.getint('DocumentFormat', 'title_levels', fallback=self.title_levels)

        return True

    def save_to_file(self, file_path):
        config_parser = configparser.ConfigParser()

        # File handling
        config_parser['FileHandling'] = {
            'txt_encoding': self.txt_encoding,
            'match_pattern': self.match_pattern,
            'output_location': self.output_location,
            'last_txt_folder': self.last_txt_folder,
            'last_images_root': self.last_images_root,
            'last_output_root': self.last_output_root
        }

        # Text processing
        config_parser['TextProcessing'] = {
            'reverse_text_order': str(self.reverse_text_order),
            'replace_punctuation': str(self.replace_punctuation),
            'add_disclaimer': str(self.add_disclaimer),
            'enable_char_errors': str(self.enable_char_errors),
            'char_error_intensity': str(self.char_error_intensity),
            'char_error_db_path': self.char_error_db_path
        }

        # Image processing
        config_parser['ImageProcessing'] = {
            'image_sort_by': self.image_sort_by,
            'image_resize': self.image_resize,
            'image_width': str(self.image_width),
            'image_alignment': self.image_alignment,
            'image_strategy': self.image_strategy
        }

        # Document formatting
        config_parser['DocumentFormat'] = {
            'line_spacing': str(self.line_spacing),
            'title_levels': str(self.title_levels)
        }

        with open(file_path, 'w', encoding='utf-8') as f:
            config_parser.write(f)

        return True


# Global configuration instance
config = Config()
config.load_from_file(CONFIG_FILE_PATH)


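# For reference, an abridged sketch of the INI layout that save_to_file()
# writes to ~/.txt2md2docx.ini (values shown are the defaults from
# Config.__init__; illustrative only):
#
#   [FileHandling]
#   txt_encoding = utf-8
#   match_pattern = exact
#   output_location = txt_folder
#
#   [TextProcessing]
#   reverse_text_order = False
#   enable_char_errors = False
#   char_error_intensity = 0.3
#
#   [ImageProcessing]
#   image_sort_by = name
#   image_width = 6
#
#   [DocumentFormat]
#   line_spacing = 1.5
#   title_levels = 6

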
# Text-processing utilities (enhanced)
class TextProcessor:
    @staticmethod
    def replace_periods(text: str) -> str:
        """
        Replace full stops in the middle of the text with commas;
        if the text ends with a full stop, drop that final one.
        """
        text = text.rstrip()
        if not text:
            return ''

        # Drop a trailing full stop, if any.
        if text[-1] == '。':
            text = text[:-1]

        # Replace the remaining full stops with commas.
        return text.replace('。', ',')

    @staticmethod
    def reverse_text_order(content):
        """Reverse the text (character by character)."""
        if not content:
            return content
        return content[::-1]

    @staticmethod
    def reverse_paragraph_order(content):
        """Reverse the paragraph order (text inside each paragraph is kept)."""
        if not content:
            return content
        paragraphs = content.split('\n')
        return '\n'.join(reversed(paragraphs))

    @staticmethod
    def apply_char_errors(text: str) -> str:
        """Apply typo injection."""
        if not config.enable_char_errors or not text:
            return text

        try:
            modified_text, replace_count, _, _ = introduce_char_errors(
                text,
                config.char_error_intensity,
                config.char_error_db_path
            )
            print(f"已应用错别字处理,替换了 {replace_count} 个字符。")
            return modified_text
        except Exception as e:
            # Fall back to the original text if typo injection fails.
            print(f"错别字处理出错: {e}")
            return text

    @staticmethod
    def process_text_content(text):
        """Apply all text transforms: order conversion, typo injection and punctuation replacement."""
        if not text or not text.strip():
            return text

        # Text-order conversion first.
        if config.reverse_text_order:
            text = replace_text(text)

        # Then typo injection.
        text = TextProcessor.apply_char_errors(text)

        # Punctuation replacement last.
        if config.replace_punctuation:
            text = TextProcessor.replace_periods(text)

        return text


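# Minimal sketch of what TextProcessor.replace_periods does (illustrative only,
# never called by the tool; the sample string is an assumption): the trailing
# full stop is dropped and the remaining ones become commas.
def _demo_replace_periods() -> None:
    assert TextProcessor.replace_periods("他来了。他走了。") == "他来了,他走了"

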
# Enhanced Markdown parser
class MarkdownParser:
    # Markdown pattern table
    PATTERNS = {
        'heading': re.compile(r'^(\s*)(#{1,6})\s+(.+)$'),
        'bold_asterisk': re.compile(r'\*\*(.+?)\*\*'),
        'bold_underscore': re.compile(r'__(.+?)__'),
        'italic_asterisk': re.compile(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)'),
        'italic_underscore': re.compile(r'_(.+?)_'),
        'code_inline': re.compile(r'`([^`]+)`'),
        'code_block': re.compile(r'^```(\w+)?\s*\n(.*?)\n```', re.MULTILINE | re.DOTALL),
        'strikethrough': re.compile(r'~~(.+?)~~'),
        'link': re.compile(r'\[([^\]]+)\]\(([^)]+)\)'),
        'image': re.compile(r'!\[([^\]]*)\]\(([^)]+)\)'),
        'unordered_list': re.compile(r'^\s*[-*+]\s+(.+)$'),
        'ordered_list': re.compile(r'^\s*\d+\.\s+(.+)$'),
        'blockquote': re.compile(r'^\s*>\s*(.+)$'),
        'horizontal_rule': re.compile(r'^(\s*[-*_]){3,}\s*$'),
        'table_row': re.compile(r'^\|(.+)\|$'),
        'table_separator': re.compile(r'^\|(\s*:?-+:?\s*\|)+$')
    }

    @staticmethod
    def parse(txt_content):
        """Parse Markdown content into structured data."""
        elements = []
        lines = txt_content.split('\n')
        i = 0
        current_section = None
        in_code_block = False
        code_block_content = []
        code_block_language = ''
        table_mode = False
        table_rows = []

        while i < len(lines):
            line = lines[i].rstrip('\r')

            # Code blocks
            if line.strip().startswith('```'):
                if not in_code_block:
                    in_code_block = True
                    code_block_language = line.strip()[3:].strip()
                    code_block_content = []
                    i += 1
                    continue
                else:
                    in_code_block = False
                    elements.append({
                        'type': 'code_block',
                        'language': code_block_language,
                        'content': '\n'.join(code_block_content),
                        'level': 0
                    })
                    code_block_content = []
                    i += 1
                    continue

            if in_code_block:
                code_block_content.append(line)
                i += 1
                continue

            # Tables
            table_match = MarkdownParser.PATTERNS['table_row'].match(line)
            table_sep_match = MarkdownParser.PATTERNS['table_separator'].match(line)

            if table_match or table_sep_match:
                if not table_mode:
                    table_mode = True
                    table_rows = []

                if table_match and not table_sep_match:
                    cells = [cell.strip() for cell in table_match.group(1).split('|')]
                    table_rows.append(cells)

                i += 1
                continue
            elif table_mode:
                # End of a table
                if table_rows:
                    elements.append({
                        'type': 'table',
                        'rows': table_rows,
                        'level': 0
                    })
                table_mode = False
                table_rows = []

            # Headings
            heading_match = MarkdownParser.PATTERNS['heading'].match(line)
            if heading_match:
                level = len(heading_match.group(2))
                if level <= config.title_levels:
                    # The heading text may still carry bold markers etc.
                    heading_text = heading_match.group(3).strip()
                    # Strip the Markdown markers but keep the text itself.
                    cleaned_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', heading_text)
                    elements.append({
                        'type': 'heading',
                        'level': level,
                        'content': heading_text,  # raw content, used for formatting
                        'cleaned_content': cleaned_text  # plain text for display
                    })
                    current_section = elements[-1]
                    current_section['paragraphs'] = []
                    i += 1
                    continue

            # Horizontal rules
            if MarkdownParser.PATTERNS['horizontal_rule'].match(line):
                elements.append({
                    'type': 'horizontal_rule',
                    'level': 0
                })
                i += 1
                continue

            # Lists
            ul_match = MarkdownParser.PATTERNS['unordered_list'].match(line)
            ol_match = MarkdownParser.PATTERNS['ordered_list'].match(line)

            if ul_match:
                elements.append({
                    'type': 'unordered_list',
                    'content': ul_match.group(1),
                    'level': 0
                })
                i += 1
                continue

            if ol_match:
                elements.append({
                    'type': 'ordered_list',
                    'content': ol_match.group(1),
                    'level': 0
                })
                i += 1
                continue

            # Blockquotes
            quote_match = MarkdownParser.PATTERNS['blockquote'].match(line)
            if quote_match:
                elements.append({
                    'type': 'blockquote',
                    'content': quote_match.group(1),
                    'level': 0
                })
                i += 1
                continue

            # Empty lines
            if line.strip() == '':
                elements.append({
                    'type': 'empty',
                    'content': '',
                    'level': 0
                })
                i += 1
                continue

            # Ordinary paragraphs
            elements.append({
                'type': 'paragraph',
                'content': line,
                'level': 0
            })

            i += 1

        # Flush a trailing table
        if table_mode and table_rows:
            elements.append({
                'type': 'table',
                'rows': table_rows,
                'level': 0
            })

        return MarkdownParser.group_by_sections(elements)

    @staticmethod
    def group_by_sections(elements):
        """Group the parsed elements by heading."""
        sections = []
        current_section = {
            'type': 'section',
            'level': 0,
            'content': '前置内容',
            'elements': []
        }

        for element in elements:
            if element['type'] == 'heading':
                # Close the current section
                if current_section['elements']:
                    sections.append(current_section)

                # Start a new section
                current_section = {
                    'type': 'section',
                    'level': element['level'],
                    'content': element['content'],
                    'elements': []
                }
            else:
                current_section['elements'].append(element)

        # Append the last section
        if current_section['elements']:
            sections.append(current_section)

        return sections

    @staticmethod
    def extract_inline_formatting(text):
        """Extract inline formatting information."""
        formatting = []

        # Bold (**)
        for match in MarkdownParser.PATTERNS['bold_asterisk'].finditer(text):
            formatting.append({
                'type': 'bold',
                'start': match.start(),
                'end': match.end(),
                'content': match.group(1)
            })

        # Bold (__)
        for match in MarkdownParser.PATTERNS['bold_underscore'].finditer(text):
            formatting.append({
                'type': 'bold',
                'start': match.start(),
                'end': match.end(),
                'content': match.group(1)
            })

        # Italic (*)
        for match in MarkdownParser.PATTERNS['italic_asterisk'].finditer(text):
            # Skip matches that overlap a bold span
            overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end']
                           for f in formatting if f['type'] == 'bold')
            if not overlaps:
                formatting.append({
                    'type': 'italic',
                    'start': match.start(),
                    'end': match.end(),
                    'content': match.group(1)
                })

        # Italic (_)
        for match in MarkdownParser.PATTERNS['italic_underscore'].finditer(text):
            overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end']
                           for f in formatting if f['type'] in ['bold', 'italic'])
            if not overlaps:
                formatting.append({
                    'type': 'italic',
                    'start': match.start(),
                    'end': match.end(),
                    'content': match.group(1)
                })

        # Inline code
        for match in MarkdownParser.PATTERNS['code_inline'].finditer(text):
            formatting.append({
                'type': 'code',
                'start': match.start(),
                'end': match.end(),
                'content': match.group(1)
            })

        # Strikethrough
        for match in MarkdownParser.PATTERNS['strikethrough'].finditer(text):
            formatting.append({
                'type': 'strikethrough',
                'start': match.start(),
                'end': match.end(),
                'content': match.group(1)
            })

        # Links
        for match in MarkdownParser.PATTERNS['link'].finditer(text):
            formatting.append({
                'type': 'link',
                'start': match.start(),
                'end': match.end(),
                'text': match.group(1),
                'url': match.group(2)
            })

        # Sort by position
        formatting.sort(key=lambda x: x['start'])
        return formatting


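# Rough sketch of the structure MarkdownParser.parse() returns (illustrative
# only, never called by the tool; the sample Markdown below is an assumption):
# a list of section dicts, each carrying the heading text and the non-heading
# elements grouped under it.
def _demo_markdown_parse() -> None:
    sections = MarkdownParser.parse("# 标题\n\n正文第一段\n- 列表项")
    for section in sections:
        print(section['level'], section['content'],
              [e['type'] for e in section['elements']])

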
# File handling
class FileHandler:
    @staticmethod
    def scan_txt_files(folder_path):
        """Scan a folder (recursively) for all TXT files."""
        if not os.path.isdir(folder_path):
            raise Exception(f"TXT文件夹不存在: {folder_path}")

        txt_files = []
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                if file.lower().endswith(".txt"):
                    txt_path = os.path.join(root, file)
                    file_name = os.path.splitext(file)[0]
                    txt_files.append({
                        "path": txt_path,
                        "name": file_name,
                        "relative_path": os.path.relpath(txt_path, folder_path),
                        "folder": root
                    })

        if not txt_files:
            raise Exception(f"在 {folder_path} 中未找到任何TXT文件")

        return sorted(txt_files, key=lambda x: x["relative_path"])

    @staticmethod
    def find_matching_image_folders(txt_files, images_root):
        """Match image folders to TXT files by file name."""
        if not os.path.isdir(images_root):
            raise Exception(f"图片根文件夹不存在: {images_root}")

        all_image_folders = []
        for root, dirs, _ in os.walk(images_root):
            for dir_name in dirs:
                folder_path = os.path.join(root, dir_name)
                all_image_folders.append({
                    "path": folder_path,
                    "name": dir_name,
                    "relative_path": os.path.relpath(folder_path, images_root)
                })

        matched_pairs = []
        for txt in txt_files:
            matches = []
            txt_name = txt["name"].lower()

            for img_folder in all_image_folders:
                folder_name = img_folder["name"].lower()

                if config.match_pattern == "exact" and txt_name == folder_name:
                    matches.append(img_folder)
                elif config.match_pattern == "prefix" and folder_name.startswith(txt_name):
                    matches.append(img_folder)
                elif config.match_pattern == "contains" and txt_name in folder_name:
                    matches.append(img_folder)

            if matches:
                matches.sort(key=lambda x: len(x["relative_path"]))
                matched_pairs.append({
                    "txt": txt,
                    "image_folder": matches[0],
                    "all_matches": matches
                })
            else:
                matched_pairs.append({
                    "txt": txt,
                    "image_folder": None,
                    "all_matches": []
                })

        return matched_pairs

    @staticmethod
    def get_image_files(folder_path):
        """Collect all image files in a folder."""
        if not folder_path or not os.path.isdir(folder_path):
            return []

        image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif', '*.webp', '*.tiff']
        image_files = []

        for ext in image_extensions:
            image_files.extend(glob.glob(os.path.join(folder_path, ext)))

        if config.image_sort_by == "name":
            image_files.sort()
        elif config.image_sort_by == "time":
            image_files.sort(key=lambda x: os.path.getmtime(x))

        return image_files

    @staticmethod
    def read_markdown_txt(file_path):
        """Read a TXT file that contains Markdown content, trying several encodings."""
        if not os.path.exists(file_path):
            raise Exception(f"TXT文件不存在: {file_path}")

        encodings = [config.txt_encoding, "gbk", "utf-16", "iso-8859-1"]
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
                content = content.replace("\r\n", "\n").replace("\r", "\n")
                return content
            except UnicodeDecodeError:
                continue

        raise Exception(f"无法解析TXT文件(编码问题): {file_path}")

    @staticmethod
    def prepare_output_path(txt_info, images_root, output_root):
        """Build the output file path, adding a numeric suffix if it already exists."""
        if config.output_location == "txt_folder":
            base_folder = txt_info["folder"]
        else:
            base_folder = output_root

        os.makedirs(base_folder, exist_ok=True)

        txt_name = txt_info["name"]
        output_path = os.path.join(base_folder, f"{txt_name}.docx")

        counter = 1
        while os.path.exists(output_path):
            output_path = os.path.join(base_folder, f"{txt_name}_{counter}.docx")
            counter += 1

        return output_path


# Image processing
class ImageProcessor:
    @staticmethod
    def process_image(image_path):
        """Open an image, honour its EXIF orientation and optionally resize it."""
        try:
            with Image.open(image_path) as img:
                # EXIF orientation
                if hasattr(img, '_getexif'):
                    exif = img._getexif()
                    if exif:
                        orientation_tag = 274
                        if orientation_tag in exif:
                            orientation = exif[orientation_tag]
                            if orientation == 3:
                                img = img.rotate(180, expand=True)
                            elif orientation == 6:
                                img = img.rotate(270, expand=True)
                            elif orientation == 8:
                                img = img.rotate(90, expand=True)

                # Resizing
                if config.image_resize == "width" and config.image_width > 0:
                    target_width_px = config.image_width * 96  # assume 96 px per inch
                    width, height = img.size

                    if width > target_width_px:
                        ratio = target_width_px / width
                        new_height = int(height * ratio)
                        img = img.resize((int(target_width_px), new_height), Image.LANCZOS)

                    return img, config.image_width
                else:
                    width_in = img.width / 96
                    return img, width_in
        except Exception as e:
            raise Exception(f"处理图片失败 {image_path}: {str(e)}")

    @staticmethod
    def get_image_alignment():
        """Map the configured alignment to a python-docx constant."""
        if config.image_alignment == "left":
            return WD_ALIGN_PARAGRAPH.LEFT
        elif config.image_alignment == "right":
            return WD_ALIGN_PARAGRAPH.RIGHT
        else:
            return WD_ALIGN_PARAGRAPH.CENTER


DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`"""


# DOCX generation
class DocxGenerator:
    @staticmethod
    def generate(sections, image_files, output_path, progress_callback=None):
        """Generate the DOCX document."""
        doc = Document()
        total_sections = len(sections)
        image_index = 0
        image_count = len(image_files)

        for i, section in enumerate(sections):
            if progress_callback:
                progress = int((i / total_sections) * 100)
                progress_callback(progress, f"处理章节: {section['content'][:30]}...")

            # Section heading
            if section['level'] > 0 and section['level'] <= config.title_levels:
                # Run the raw, still-formatted heading text through the text transforms.
                heading_text = TextProcessor.process_text_content(section['content'])
                para = doc.add_heading(level=section['level'])
                # Apply inline formatting (including bold).
                DocxGenerator.apply_inline_formatting(para, heading_text)
            elif section['content'] != '前置内容':
                heading_text = TextProcessor.process_text_content(section['content'])
                para = doc.add_paragraph()
                run = para.add_run(heading_text)
                run.font.size = Pt(14)
                run.font.bold = True
                para.paragraph_format.space_after = Pt(12)

            # Elements inside the section
            elements = section.get('elements', [])
            if not elements:
                continue

            # An image is inserted after the first non-empty element.
            first_content_added = False

            for element in elements:
                # Add the element to the document.
                DocxGenerator.add_element_to_doc(doc, element)

                # After the first real content element, insert an image.
                if not first_content_added and element['type'] not in ['empty']:
                    first_content_added = True

                    if image_count > 0 and image_index < image_count:
                        try:
                            DocxGenerator.insert_image(doc, image_files[image_index], output_path)
                            image_index += 1

                            if image_index >= image_count:
                                if config.image_strategy == "cycle":
                                    image_index = 0
                                elif config.image_strategy == "repeat_last":
                                    image_index = image_count - 1
                                # "truncate" leaves image_index past the end, so no more images are inserted.

                        except Exception as e:
                            doc.add_paragraph(f"[图片插入失败: {str(e)}]")

        # Disclaimer
        if config.add_disclaimer:
            doc.add_paragraph("---")
            para = doc.add_paragraph()
            disclaimer_text = TextProcessor.process_text_content(DISCLAIMER_TEXT)
            run = para.add_run(disclaimer_text)
            run.font.size = Pt(10)
            para.paragraph_format.line_spacing = 1.0

        try:
            doc.save(output_path)
            if progress_callback:
                progress_callback(100, "转换完成!")
            return True
        except Exception as e:
            raise Exception(f"保存DOCX失败: {str(e)}")

    @staticmethod
    def add_element_to_doc(doc, element):
        """Add one parsed element to the document."""
        etype = element['type']
        content = TextProcessor.process_text_content(element.get('content', ''))

        if etype == 'paragraph':
            DocxGenerator.add_formatted_paragraph(doc, content)

        elif etype == 'unordered_list':
            para = doc.add_paragraph(style='List Bullet')
            DocxGenerator.apply_inline_formatting(para, content)

        elif etype == 'ordered_list':
            para = doc.add_paragraph(style='List Number')
            DocxGenerator.apply_inline_formatting(para, content)

        elif etype == 'blockquote':
            para = doc.add_paragraph(style='Quote')
            DocxGenerator.apply_inline_formatting(para, content)

        elif etype == 'code_block':
            para = doc.add_paragraph(style='No Spacing')
            run = para.add_run(element['content'])
            run.font.name = 'Courier New'
            run.font.size = Pt(10)

        elif etype == 'table':
            DocxGenerator.add_table_to_doc(doc, element['rows'])

        elif etype == 'horizontal_rule':
            DocxGenerator.add_horizontal_rule(doc)

        elif etype == 'empty':
            doc.add_paragraph()

    @staticmethod
    def add_horizontal_rule(doc):
        """Add a horizontal rule to the document."""
        para = doc.add_paragraph()
        run = para.add_run()
        # Use a long underlined run of spaces as the rule.
        run.font.underline = True
        run.text = " " * 100
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    @staticmethod
    def add_table_to_doc(doc, rows):
        """Add a table to the document."""
        if not rows:
            return

        table = doc.add_table(rows=len(rows), cols=len(rows[0]))
        table.style = 'Table Grid'

        for i, row_data in enumerate(rows):
            row_cells = table.rows[i].cells
            for j, cell_data in enumerate(row_data):
                if j < len(row_cells):
                    # Run the cell content through the text transforms.
                    processed_text = TextProcessor.process_text_content(cell_data)
                    row_cells[j].text = processed_text

    @staticmethod
    def insert_image(doc, image_path, output_path):
        """Insert an image into the document."""
        img, width = ImageProcessor.process_image(image_path)

        temp_img_path = None
        if config.image_resize == "width":
            temp_dir = os.path.dirname(output_path)
            os.makedirs(temp_dir, exist_ok=True)
            temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
            img.save(temp_img_path)
            img_path = temp_img_path
        else:
            img_path = image_path

        para = doc.add_paragraph()
        run = para.runs[0] if para.runs else para.add_run()
        run.add_picture(img_path, width=Inches(width))
        para.alignment = ImageProcessor.get_image_alignment()

        if temp_img_path and os.path.exists(temp_img_path):
            try:
                os.remove(temp_img_path)
            except OSError:
                pass  # Ignore failures while deleting the temporary file.

    @staticmethod
    def add_formatted_paragraph(doc, content):
        """Add a paragraph with inline formatting."""
        if not content or not content.strip():
            doc.add_paragraph()
            return

        para = doc.add_paragraph()
        DocxGenerator.apply_inline_formatting(para, content)

        if config.line_spacing > 0:
            para.paragraph_format.line_spacing = config.line_spacing

    @staticmethod
    def apply_inline_formatting(paragraph, text):
        """Apply inline formatting to a paragraph."""
        # Run the text transforms first (order conversion, typos, punctuation).
        processed_text = TextProcessor.process_text_content(text)

        # Re-extract the formatting, since the text may have changed.
        formatting = MarkdownParser.extract_inline_formatting(processed_text)

        # No formatting: add the text as a single run.
        if not formatting:
            paragraph.add_run(processed_text)
            return

        current_pos = 0

        for fmt in formatting:
            # Plain text before the formatted span
            if fmt['start'] > current_pos:
                paragraph.add_run(processed_text[current_pos:fmt['start']])

            # Formatted run
            if fmt['type'] == 'bold':
                # Strip the Markdown markers and apply the format.
                clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', processed_text[fmt['start']:fmt['end']])
                run = paragraph.add_run(clean_text)
                run.bold = True

            elif fmt['type'] == 'italic':
                clean_text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)|_(.+?)_', r'\1\2',
                                    processed_text[fmt['start']:fmt['end']])
                run = paragraph.add_run(clean_text)
                run.italic = True

            elif fmt['type'] == 'code':
                clean_text = re.sub(r'`([^`]+)`', r'\1', processed_text[fmt['start']:fmt['end']])
                run = paragraph.add_run(clean_text)
                run.font.name = 'Courier New'
                run.font.size = Pt(10)

            elif fmt['type'] == 'strikethrough':
                clean_text = re.sub(r'~~(.+?)~~', r'\1', processed_text[fmt['start']:fmt['end']])
                run = paragraph.add_run(clean_text)
                run.font.strike = True

            elif fmt['type'] == 'link':
                # For links, only the link text is rendered.
                run = paragraph.add_run(fmt['text'])
                run.font.color.rgb = RGBColor(0, 0, 255)  # blue
                run.underline = True

            current_pos = fmt['end']

        # Remaining plain text
        if current_pos < len(processed_text):
            paragraph.add_run(processed_text[current_pos:])


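# End-to-end sketch for a single file (illustrative only, never called by the
# tool; the paths below are assumptions): read one TXT, parse it, pick up the
# images in a sibling folder and render the DOCX, mirroring what
# BatchProcessor.process_batch does per matched pair.
def _demo_single_conversion(txt_path: str = "sample.txt", image_dir: str = "sample_images") -> None:
    content = FileHandler.read_markdown_txt(txt_path)
    sections = MarkdownParser.parse(content)
    images = FileHandler.get_image_files(image_dir)
    DocxGenerator.generate(sections, images, "sample.docx")

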
# Batch processing
class BatchProcessor:
    @staticmethod
    def process_batch(matched_pairs, output_root, progress_callback=None):
        """Process all matched TXT/image-folder pairs."""
        total = len(matched_pairs)
        success_count = 0
        failed_items = []

        for i, pair in enumerate(matched_pairs):
            try:
                if progress_callback:
                    overall_progress = int((i / total) * 100)
                    progress_callback(overall_progress,
                                      f"处理 {i + 1}/{total}: {pair['txt']['name']}")

                # Output path
                output_path = FileHandler.prepare_output_path(
                    pair['txt'],
                    pair['image_folder']['path'] if pair['image_folder'] else "",
                    output_root
                )

                # Read the TXT content
                txt_content = FileHandler.read_markdown_txt(pair['txt']['path'])

                # Parse it into structured sections
                sections = MarkdownParser.parse(txt_content)

                if not sections:
                    raise Exception("未解析到有效内容")

                # Collect the image files
                image_files = []
                if pair['image_folder']:
                    image_files = FileHandler.get_image_files(pair['image_folder']['path'])

                # Generate the DOCX
                def update_file_progress(progress, text):
                    if progress_callback:
                        sub_progress = int((i + progress / 100) / total * 100)
                        progress_callback(sub_progress, f"{pair['txt']['name']}: {text}")

                DocxGenerator.generate(sections, image_files, output_path, update_file_progress)
                success_count += 1

            except Exception as e:
                failed_items.append({
                    "name": pair['txt']['name'],
                    "error": str(e)
                })

        # Work out the main output folder
        if matched_pairs and success_count > 0:
            sample_output = FileHandler.prepare_output_path(matched_pairs[0]['txt'], "", output_root)
            main_output_folder = os.path.dirname(sample_output)
        else:
            main_output_folder = ""

        return {
            "total": total,
            "success": success_count,
            "failed": len(failed_items),
            "failed_items": failed_items,
            "main_output_folder": main_output_folder
        }


# Configuration window
def show_config_window():
    """Show the configuration window."""
    # Tabbed layout
    tab_file_layout = [
        [sg.Text('文件处理设置', font=('bold', 12))],
        [sg.HSeparator()],
        [sg.Text('TXT编码:', size=(12, 1)),
         sg.Combo(['utf-8', 'gbk', 'utf-16'], default_value=config.txt_encoding, key='txt_encoding', size=(15, 1))],
        [sg.Text('匹配模式:', size=(12, 1))],
        [sg.Radio('完全匹配(文件名与文件夹名相同)', 'match', default=config.match_pattern == "exact",
                  key='match_exact')],
        [sg.Radio('前缀匹配', 'match', default=config.match_pattern == "prefix", key='match_prefix')],
        [sg.Radio('包含匹配', 'match', default=config.match_pattern == "contains", key='match_contains')],
        [sg.HSeparator()],
        [sg.Text('输出位置:', size=(12, 1))],
        [sg.Radio('输出到TXT文件所在文件夹', 'output_loc', default=config.output_location == "txt_folder",
                  key='output_txt_folder')],
        [sg.Radio('输出到指定文件夹', 'output_loc', default=config.output_location == "custom", key='output_custom')]
    ]

    tab_text_layout = [
        [sg.Text('文字处理设置', font=('bold', 12))],
        [sg.HSeparator()],
        [sg.Checkbox('转换文字顺序', key='-REVERSE_TEXT-', default=config.reverse_text_order)],
        [sg.Checkbox('替换标点符号(句号转逗号,保留结尾句号)', key='-REPLACE_PUNCTUATION-',
                     default=config.replace_punctuation)],
        [sg.HSeparator()],
        [sg.Text('错别字处理', font=('bold', 11), text_color='darkblue')],
        [sg.Checkbox('启用错别字处理', key='-ENABLE_CHAR_ERRORS-', default=config.enable_char_errors,
                     enable_events=True)],
        [sg.Text('错误强度:', size=(10, 1)),
         sg.Slider(range=(0.0, 1.0), default_value=config.char_error_intensity, resolution=0.1,
                   orientation='h', size=(20, 15), key='char_error_intensity', disabled=not config.enable_char_errors)],
        [sg.Text('错别字库路径:', size=(12, 1)),
         sg.InputText(config.char_error_db_path, key='char_error_db_path', size=(30, 1),
                      disabled=not config.enable_char_errors),
         sg.FileBrowse('浏览', file_types=(("JSON Files", "*.json"),), disabled=not config.enable_char_errors)],
        [sg.HSeparator()],
        [sg.Checkbox('添加免责声明', key='-ADD_DISCLAIMER-', default=config.add_disclaimer)]
    ]

    tab_image_layout = [
        [sg.Text('图片处理设置', font=('bold', 12))],
        [sg.HSeparator()],
        [sg.Text('图片排序方式:', size=(12, 1))],
        [sg.Radio('按名称', 'sort', default=config.image_sort_by == "name", key='sort_name'),
         sg.Radio('按修改时间', 'sort', default=config.image_sort_by == "time", key='sort_time')],
        [sg.HSeparator()],
        [sg.Text('图片尺寸调整:', size=(12, 1))],
        [sg.Radio('不调整', 'resize', default=config.image_resize == "none", key='resize_none')],
        [sg.Radio('按宽度:', 'resize', default=config.image_resize == "width", key='resize_width'),
         sg.InputText(str(config.image_width), size=(8, 1), key='image_width'),
         sg.Text('英寸')],
        [sg.HSeparator()],
        [sg.Text('图片对齐方式:', size=(12, 1))],
        [sg.Radio('左对齐', 'align', default=config.image_alignment == "left", key='align_left'),
         sg.Radio('居中', 'align', default=config.image_alignment == "center", key='align_center'),
         sg.Radio('右对齐', 'align', default=config.image_alignment == "right", key='align_right')],
        [sg.HSeparator()],
        [sg.Text('图片不足时策略:', size=(12, 1))],
        [sg.Radio('循环使用', 'strategy', default=config.image_strategy == "cycle", key='strategy_cycle')],
        [sg.Radio('忽略多余标题', 'strategy', default=config.image_strategy == "truncate", key='strategy_truncate')],
        [sg.Radio('重复最后一张', 'strategy', default=config.image_strategy == "repeat_last", key='strategy_repeat')]
    ]

    tab_format_layout = [
        [sg.Text('文档格式设置', font=('bold', 12))],
        [sg.HSeparator()],
        [sg.Text('行间距:', size=(12, 1)),
         sg.InputText(str(config.line_spacing), size=(8, 1), key='line_spacing')],
        [sg.Text('最大标题层级:', size=(12, 1)),
         sg.Combo([1, 2, 3, 4, 5, 6], default_value=config.title_levels, key='title_levels', size=(8, 1))]
    ]

    layout = [
        [sg.TabGroup([
            [sg.Tab('文件处理', tab_file_layout, key='tab_file')],
            [sg.Tab('文字处理', tab_text_layout, key='tab_text')],
            [sg.Tab('图片处理', tab_image_layout, key='tab_image')],
            [sg.Tab('文档格式', tab_format_layout, key='tab_format')]
        ])],
        [sg.HSeparator()],
        [sg.Button('确定', size=(10, 1)), sg.Button('取消', size=(10, 1)), sg.Button('重置为默认', size=(12, 1))]
    ]

    window = sg.Window('转换设置', layout, modal=True, resizable=True, size=(500, 450))

    while True:
        event, values = window.read()

        if event in (sg.WIN_CLOSED, '取消'):
            break

        # Enable/disable the typo-injection widgets
        if event == '-ENABLE_CHAR_ERRORS-':
            enabled = values['-ENABLE_CHAR_ERRORS-']
            window['char_error_intensity'].update(disabled=not enabled)
            window['char_error_db_path'].update(disabled=not enabled)

        if event == '重置为默认':
            # Reset all widgets to the defaults
            default_config = Config()
            window['txt_encoding'].update(default_config.txt_encoding)
            window['match_exact'].update(True)
            window['output_txt_folder'].update(True)
            window['-REVERSE_TEXT-'].update(default_config.reverse_text_order)
            window['-REPLACE_PUNCTUATION-'].update(default_config.replace_punctuation)
            window['-ENABLE_CHAR_ERRORS-'].update(default_config.enable_char_errors)
            window['char_error_intensity'].update(default_config.char_error_intensity)
            window['char_error_db_path'].update(default_config.char_error_db_path)
            window['-ADD_DISCLAIMER-'].update(default_config.add_disclaimer)
            window['sort_name'].update(True)
            window['resize_none'].update(True)
            window['image_width'].update(str(default_config.image_width))
            window['align_center'].update(True)
            window['strategy_cycle'].update(True)
            window['line_spacing'].update(str(default_config.line_spacing))
            window['title_levels'].update(default_config.title_levels)

        if event == '确定':
            # Persist the configuration
            config.txt_encoding = values['txt_encoding']

            if values['match_exact']:
                config.match_pattern = "exact"
            elif values['match_prefix']:
                config.match_pattern = "prefix"
            else:
                config.match_pattern = "contains"

            config.output_location = "txt_folder" if values['output_txt_folder'] else "custom"
            config.image_sort_by = "name" if values['sort_name'] else "time"
            config.image_resize = "none" if values['resize_none'] else "width"
            config.reverse_text_order = values['-REVERSE_TEXT-']
            config.replace_punctuation = values['-REPLACE_PUNCTUATION-']
            config.add_disclaimer = values['-ADD_DISCLAIMER-']

            # Typo-injection settings
            config.enable_char_errors = values['-ENABLE_CHAR_ERRORS-']
            config.char_error_intensity = values['char_error_intensity']
            config.char_error_db_path = values['char_error_db_path']

            try:
                config.image_width = float(values['image_width'])
            except (ValueError, TypeError):
                pass

            if values['align_left']:
                config.image_alignment = "left"
            elif values['align_right']:
                config.image_alignment = "right"
            else:
                config.image_alignment = "center"

            if values['strategy_cycle']:
                config.image_strategy = "cycle"
            elif values['strategy_truncate']:
                config.image_strategy = "truncate"
            else:
                config.image_strategy = "repeat_last"

            try:
                config.line_spacing = float(values['line_spacing'])
                config.title_levels = int(values['title_levels'])
            except (ValueError, TypeError):
                pass

            config.save_to_file(CONFIG_FILE_PATH)
            break

    window.close()


# Matching editor window
def show_matching_editor(matched_pairs, images_root):
    """Show the matching editor, which lets the user adjust matches manually."""
    all_image_folders = []
    if os.path.isdir(images_root):
        for root, dirs, _ in os.walk(images_root):
            for dir_name in dirs:
                folder_path = os.path.join(root, dir_name)
                rel_path = os.path.relpath(folder_path, images_root)
                all_image_folders.append((folder_path, rel_path))

    table_data = []
    for i, pair in enumerate(matched_pairs):
        txt_name = pair['txt']['name']
        img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配"
        table_data.append([i, txt_name, img_folder])

    layout = [
        [sg.Text('文件匹配编辑', font=('bold', 14))],
        [sg.Text('选择要修改的项目,然后从右侧选择图片文件夹')],
        [
            sg.Table(
                values=table_data,
                headings=['序号', 'TXT文件名', '匹配的图片文件夹'],
                key='-TABLE-',
                select_mode=sg.TABLE_SELECT_MODE_BROWSE,
                enable_events=True,
                justification='left',
                size=(None, 15)
            ),
            sg.VSeparator(),
            sg.Listbox(
                values=[f[1] for f in all_image_folders],
                key='-FOLDERS-',
                size=(40, 15),
                enable_events=True
            )
        ],
        [sg.Button('设置选中项'), sg.Button('清除选中项'), sg.Button('应用所有')]
    ]

    window = sg.Window('匹配编辑', layout, resizable=True)
    selected_row = None

    while True:
        event, values = window.read()

        if event in (sg.WIN_CLOSED, '应用所有'):
            break

        if event == '-TABLE-':
            if values['-TABLE-']:
                selected_row = values['-TABLE-'][0]

        if event == '设置选中项' and selected_row is not None and values['-FOLDERS-']:
            folder_idx = [i for i, f in enumerate(all_image_folders) if f[1] == values['-FOLDERS-'][0]][0]
            folder_path, folder_rel = all_image_folders[folder_idx]

            matched_pairs[selected_row]['image_folder'] = {
                "path": folder_path,
                "name": os.path.basename(folder_path),
                "relative_path": folder_rel
            }

            table_data[selected_row][2] = folder_rel
            window['-TABLE-'].update(values=table_data)

        if event == '清除选中项' and selected_row is not None:
            matched_pairs[selected_row]['image_folder'] = None
            table_data[selected_row][2] = "无匹配"
            window['-TABLE-'].update(values=table_data)

    window.close()
    return matched_pairs


# Help window
def show_help_window():
    """Show the help window."""
    help_text = """
批量Markdown TXT转DOCX工具使用说明:

1. 选择包含Markdown内容的TXT文件所在文件夹
2. 选择图片文件夹的根目录(程序会自动查找子文件夹)
3. 选择输出文件的保存根目录(当选择"输出到指定文件夹"时有效)
4. 点击"扫描文件"按钮,程序会自动匹配TXT文件和图片文件夹
5. 查看匹配结果,可点击"编辑匹配"调整匹配关系
6. 点击"开始批量转换"生成DOCX文件

支持的Markdown格式:
- 标题:# ## ### #### ##### ######
- 粗体:**文字** 或 __文字__
- 斜体:*文字* 或 _文字_
- 行内代码:`代码`
- 代码块:```语言\\n代码\\n```
- 删除线:~~文字~~
- 链接:[链接文字](URL)
- 图片:![描述文字](URL)
- 无序列表:- 或 * 或 +
- 有序列表:1. 2. 3.
- 引用:> 引用内容
- 表格:| 列1 | 列2 |
- 水平分隔线:--- 或 *** 或 ___

文字处理功能:
- 转换文字顺序:将文字内容进行特定转换处理
- 错别字处理:可以按设定强度引入常见的错别字,用于测试或特殊用途
- 标点符号替换:将句号转换为逗号,保留文末句号

输出路径选择:
- 输出到TXT文件所在文件夹: 每个DOCX文件会直接保存在对应TXT文件所在的文件夹中
- 输出到指定文件夹: 所有DOCX文件会直接保存在您指定的文件夹中

匹配规则:
- 完全匹配: TXT文件名(不含扩展名)与图片文件夹名完全相同
- 前缀匹配: 图片文件夹名以前缀形式包含TXT文件名
- 包含匹配: 图片文件夹名中包含TXT文件名

转换规则:
- 每个小标题的第一段后会插入一张图片
- 先将Markdown格式转换为DOCX格式,再处理文字内容
- 支持文字顺序调换、错别字处理和标点符号替换功能

错别字处理说明:
- 错误强度:控制替换比例,0.0表示不替换,1.0表示替换所有可能的字
- 错别字库:可自定义JSON格式的错别字映射文件
- 常见映射:的↔地↔得、在↔再、是↔事等
"""
    sg.popup_scrolled('使用帮助', help_text, size=(70, 25))


# Results window
def show_results_window(results):
    """Show the batch-processing results."""
    if results['failed'] == 0:
        message = f"全部成功!\n共处理 {results['total']} 个文件,全部转换成功。"
        if results['main_output_folder']:
            message += f"\n主要输出文件夹: {results['main_output_folder']}"
        sg.popup('处理完成', message)
    else:
        failed_text = "\n".join([f"- {item['name']}: {item['error']}" for item in results['failed_items']])
        message = (f"处理完成!\n共处理 {results['total']} 个文件,"
                   f"{results['success']} 个成功,{results['failed']} 个失败。\n\n"
                   f"失败项:\n{failed_text}")
        if results['main_output_folder']:
            message += f"\n主要输出文件夹: {results['main_output_folder']}"
        sg.popup_scrolled('处理完成', message, size=(60, 20))

    # Offer to open the main output folder
    if results['main_output_folder'] and os.path.exists(results['main_output_folder']):
        if sg.popup_yes_no('是否打开主要输出文件夹?') == 'Yes':
            if sys.platform.startswith('win'):
                os.startfile(results['main_output_folder'])
            elif sys.platform.startswith('darwin'):
                os.system(f'open "{results["main_output_folder"]}"')
            else:
                os.system(f'xdg-open "{results["main_output_folder"]}"')


# Main window
def main_window():
    """Main window."""
    sg.theme('BlueMono')
    matched_pairs = []

    layout = [
        [sg.Text('批量Markdown TXT转DOCX工具', font=('bold', 16))],
        [sg.Text('(按文件名匹配TXT文件和图片文件夹,支持完整Markdown格式)', text_color='gray')],
        [sg.HSeparator()],
        [sg.Text('TXT文件文件夹:', size=(15, 1)),
         sg.InputText(key='txt_folder', enable_events=True, default_text=config.last_txt_folder),
         sg.FolderBrowse('浏览')],
        [sg.Text('图片根文件夹:', size=(15, 1)),
         sg.InputText(key='images_root', enable_events=True, default_text=config.last_images_root),
         sg.FolderBrowse('浏览')],
        [sg.Text('输出根文件夹:', size=(15, 1)),
         sg.InputText(key='output_root', enable_events=True, default_text=config.last_output_root),
         sg.FolderBrowse('浏览'),
         sg.Text('(当选择"输出到指定文件夹"时有效)', text_color='gray')],
        [sg.Button('扫描文件', size=(12, 1)),
         sg.Button('编辑匹配', size=(12, 1), disabled=True),
         sg.Button('转换设置', size=(12, 1)),
         sg.Button('帮助', size=(8, 1))],
        [sg.HSeparator()],
        [sg.Text('匹配结果预览:', font=('bold', 10))],
        [sg.Table(
            values=[],
            headings=['TXT文件名', '相对路径', '匹配的图片文件夹'],
            key='-PREVIEW_TABLE-',
            auto_size_columns=False,
            col_widths=[20, 30, 30],
            justification='left',
            size=(None, 10)
        )],
        [sg.ProgressBar(100, orientation='h', size=(80, 20), key='progress_bar', visible=False)],
        [sg.Text('状态: 就绪', key='status_text', size=(80, 1))],
        [sg.Button('开始批量转换', size=(15, 1), disabled=True), sg.Button('退出')]
    ]

    window = sg.Window('批量Markdown TXT转DOCX工具', layout, resizable=True)
    progress_bar = window['progress_bar']
    status_text = window['status_text']
    preview_table = window['-PREVIEW_TABLE-']
    output_root_input = window['output_root']

    def update_output_root_state():
        """Enable or grey out the output-root input according to the configuration."""
        if config.output_location == "custom":
            output_root_input.update(disabled=False)
            output_root_input.Widget.configure(foreground='black')
        else:
            output_root_input.update(disabled=True)
            output_root_input.Widget.configure(foreground='gray')

    window.read(timeout=1)
    update_output_root_state()

    while True:
        event, values = window.read()

        if event in (sg.WIN_CLOSED, '退出'):
            if values is not None:
                config.last_txt_folder = values.get('txt_folder', '')
                config.last_images_root = values.get('images_root', '')
                config.last_output_root = values.get('output_root', '')
                config.save_to_file(CONFIG_FILE_PATH)
            break

        if event == '转换设置':
            current_output_root = values['output_root']
            show_config_window()
            update_output_root_state()
            window['output_root'].update(current_output_root)

        if event == '帮助':
            show_help_window()

        if event == '扫描文件':
            txt_folder = values['txt_folder']
            images_root = values['images_root']

            if not txt_folder:
                sg.popup_error('请选择TXT文件所在的文件夹')
                continue

            if not images_root:
                sg.popup_error('请选择图片根文件夹')
                continue

            config.last_txt_folder = txt_folder
            config.last_images_root = images_root
            if values['output_root']:
                config.last_output_root = values['output_root']
            config.save_to_file(CONFIG_FILE_PATH)

            try:
                status_text.update('正在扫描TXT文件...')
                window.refresh()
                txt_files = FileHandler.scan_txt_files(txt_folder)

                status_text.update('正在匹配图片文件夹...')
                window.refresh()
                matched_pairs = FileHandler.find_matching_image_folders(txt_files, images_root)

                table_data = []
                for pair in matched_pairs:
                    img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配"
                    table_data.append([
                        pair['txt']['name'],
                        pair['txt']['relative_path'],
                        img_folder
                    ])

                preview_table.update(values=table_data)
                status_text.update(f'扫描完成: 找到 {len(matched_pairs)} 个TXT文件')

                window['编辑匹配'].update(disabled=False)
                window['开始批量转换'].update(disabled=False)

            except Exception as e:
                sg.popup_error(f'扫描失败: {str(e)}')
                status_text.update('状态: 扫描失败')

        if event == '编辑匹配' and matched_pairs:
            images_root = values['images_root']
            if not images_root:
                sg.popup_error('请选择图片根文件夹')
                continue

            matched_pairs = show_matching_editor(matched_pairs, images_root)

            table_data = []
            for pair in matched_pairs:
                img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配"
                table_data.append([
                    pair['txt']['name'],
                    pair['txt']['relative_path'],
                    img_folder
                ])

            preview_table.update(values=table_data)

        if event == '开始批量转换' and matched_pairs:
            if config.output_location == "custom" and not values['output_root']:
                sg.popup_error('请选择输出根文件夹(在"转换设置"中选择了"输出到指定文件夹")')
                continue

            try:
                progress_bar.update(0, visible=True)
                status_text.update('开始批量转换...')
                window.refresh()

                def update_batch_progress(progress, text):
                    progress_bar.update(progress)
                    status_text.update(f'状态: {text}')
                    window.refresh()

                results = BatchProcessor.process_batch(matched_pairs, values['output_root'], update_batch_progress)
                show_results_window(results)
                status_text.update('状态: 批量转换完成')

            except Exception as e:
                sg.popup_error(f'批量处理失败: {str(e)}')
                status_text.update('状态: 批量转换失败')
            finally:
                progress_bar.update(0, visible=False)

        if (event == 'txt_folder' or event == 'images_root') and values[event] and not values['output_root']:
            default_output = values['txt_folder'] if values['txt_folder'] else values['images_root']
            window['output_root'].update(default_output)

    window.close()


# Entry point
if __name__ == '__main__':
    main_window()