From e1d93a4160d7965555339a73a93a7a8b3c109e17 Mon Sep 17 00:00:00 2001 From: taiyi Date: Tue, 9 Sep 2025 11:30:38 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=8E=B7=E5=8F=96=E7=BD=91?= =?UTF-8?q?=E9=A1=B5=E5=86=85=E5=AE=B9=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Txt2docx2.py | 1396 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1396 insertions(+) create mode 100644 Txt2docx2.py diff --git a/Txt2docx2.py b/Txt2docx2.py new file mode 100644 index 0000000..bc31bd0 --- /dev/null +++ b/Txt2docx2.py @@ -0,0 +1,1396 @@ +import os +import sys +import glob +import re +from PIL import Image +from docx import Document +from docx.shared import Inches, Pt, RGBColor +from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.enum.style import WD_STYLE_TYPE +import PySimpleGUI as sg +from replacestr import replace_text +import configparser + +CONFIG_FILE_PATH = os.path.join(os.path.expanduser("~"), ".txt2md2docx.ini") + + +# 配置设置 +class Config: + def __init__(self): + # 文件处理配置 + self.txt_encoding = "utf-8" + self.match_pattern = "exact" # exact: 完全匹配, prefix: 前缀匹配, contains: 包含 + self.output_location = "txt_folder" # txt_folder or custom + # 最近使用的文件夹路径 + self.last_txt_folder = "" + self.last_images_root = "" + self.last_output_root = "" + # 文字处理 + self.reverse_text_order = False # 转换文字顺序开关 + # 图片处理配置 + self.image_sort_by = "name" # name or time + self.image_resize = "none" # none or width + self.image_width = 6 # 英寸 + self.image_alignment = "center" # left, center, right + self.image_strategy = "cycle" # cycle, truncate, repeat_last + # 文档格式配置 + self.line_spacing = 1.5 + self.title_levels = 6 # 支持的最大标题层级 + self.replace_punctuation = False # 是否替换标点符号 + self.add_disclaimer = False # 是否添加免责声明 + + def load_from_file(self, file_path): + if not os.path.exists(file_path): + return False + + config_parser = configparser.ConfigParser() + config_parser.read(file_path, encoding='utf-8') + + 
# 加载文件处理配置 + if 'FileHandling' in config_parser: + self.txt_encoding = config_parser.get('FileHandling', 'txt_encoding', fallback=self.txt_encoding) + self.match_pattern = config_parser.get('FileHandling', 'match_pattern', fallback=self.match_pattern) + self.output_location = config_parser.get('FileHandling', 'output_location', + fallback=self.output_location) + self.last_txt_folder = config_parser.get('FileHandling', 'last_txt_folder', + fallback=self.last_txt_folder) + self.last_images_root = config_parser.get('FileHandling', 'last_images_root', + fallback=self.last_images_root) + self.last_output_root = config_parser.get('FileHandling', 'last_output_root', + fallback=self.last_output_root) + + # 加载文字处理配置 + if 'TextProcessing' in config_parser: + self.reverse_text_order = config_parser.getboolean('TextProcessing', 'reverse_text_order', + fallback=self.reverse_text_order) + self.replace_punctuation = config_parser.getboolean('TextProcessing', 'replace_punctuation', + fallback=self.replace_punctuation) + self.add_disclaimer = config_parser.getboolean('TextProcessing', 'add_disclaimer', + fallback=self.add_disclaimer) + + # 加载图片处理配置 + if 'ImageProcessing' in config_parser: + self.image_sort_by = config_parser.get('ImageProcessing', 'image_sort_by', fallback=self.image_sort_by) + self.image_resize = config_parser.get('ImageProcessing', 'image_resize', fallback=self.image_resize) + self.image_width = config_parser.getfloat('ImageProcessing', 'image_width', fallback=self.image_width) + self.image_alignment = config_parser.get('ImageProcessing', 'image_alignment', + fallback=self.image_alignment) + self.image_strategy = config_parser.get('ImageProcessing', 'image_strategy', + fallback=self.image_strategy) + + # 加载文档格式配置 + if 'DocumentFormat' in config_parser: + self.line_spacing = config_parser.getfloat('DocumentFormat', 'line_spacing', fallback=self.line_spacing) + self.title_levels = config_parser.getint('DocumentFormat', 'title_levels', fallback=self.title_levels) + 
+ return True + + def save_to_file(self, file_path): + config_parser = configparser.ConfigParser() + + # 保存文件处理配置 + config_parser['FileHandling'] = { + 'txt_encoding': self.txt_encoding, + 'match_pattern': self.match_pattern, + 'output_location': self.output_location, + 'last_txt_folder': self.last_txt_folder, + 'last_images_root': self.last_images_root, + 'last_output_root': self.last_output_root + } + + # 保存文字处理配置 + config_parser['TextProcessing'] = { + 'reverse_text_order': str(self.reverse_text_order), + 'replace_punctuation': str(self.replace_punctuation), + 'add_disclaimer': str(self.add_disclaimer) + } + + # 保存图片处理配置 + config_parser['ImageProcessing'] = { + 'image_sort_by': self.image_sort_by, + 'image_resize': self.image_resize, + 'image_width': str(self.image_width), + 'image_alignment': self.image_alignment, + 'image_strategy': self.image_strategy + } + + # 保存文档格式配置 + config_parser['DocumentFormat'] = { + 'line_spacing': str(self.line_spacing), + 'title_levels': str(self.title_levels) + } + + with open(file_path, 'w', encoding='utf-8') as f: + config_parser.write(f) + + return True + + +# 全局配置实例 +config = Config() +config.load_from_file(CONFIG_FILE_PATH) + + +# 文字处理工具类 - 增强功能 +class TextProcessor: + @staticmethod + def replace_periods(text: str) -> str: + """ + 将中间出现的句号统一替换为逗号; + 若文本末尾是句号,则直接删除该句号。 + """ + text = text.rstrip() + if not text: + return '' + + # 去掉末尾句号(如果有) + if text[-1] == '。': + text = text[:-1] + + # 把剩余句号替换为逗号 + return text.replace('。', ',') + + + @staticmethod + def reverse_text_order(content): + """反转文本顺序(按字符级反转)""" + if not content: + return content + return content[::-1] + + @staticmethod + def reverse_paragraph_order(content): + """反转段落顺序(保留段落内文字顺序)""" + if not content: + return content + paragraphs = content.split('\n') + return '\n'.join(reversed(paragraphs)) + + @staticmethod + def process_text_content(text): + """统一处理文字内容:顺序调换和标点符号替换""" + if not text or not text.strip(): + return text + + # 先进行文字顺序处理 + if 
config.reverse_text_order: + text = replace_text(text) + + # 再进行标点符号替换 + if config.replace_punctuation: + text = TextProcessor.replace_periods(text) + + return text + + +# 增强的Markdown解析器 +class MarkdownParser: + # Markdown格式匹配模式 + PATTERNS = { + 'heading': re.compile(r'^(\s*)(#{1,6})\s+(.+)$'), + 'bold_asterisk': re.compile(r'\*\*(.+?)\*\*'), + 'bold_underscore': re.compile(r'__(.+?)__'), + 'italic_asterisk': re.compile(r'(?\s*(.+)$'), + 'horizontal_rule': re.compile(r'^(\s*[-*_]){3,}\s*$'), + 'table_row': re.compile(r'^\|(.+)\|$'), + 'horizontal_rule': re.compile(r'^(\s*[-*_]){3,}\s*$'), + 'table_separator': re.compile(r'^\|(\s*:?-+:?\s*\|)+$') + } + + @staticmethod + def parse(txt_content): + """解析Markdown内容为结构化数据""" + elements = [] + lines = txt_content.split('\n') + i = 0 + current_section = None + in_code_block = False + code_block_content = [] + table_mode = False + table_rows = [] + + while i < len(lines): + line = lines[i].rstrip('\r') + original_line = line + + # 处理代码块 + if line.strip().startswith('```'): + if not in_code_block: + in_code_block = True + language = line.strip()[3:].strip() + code_block_content = [] + i += 1 + continue + else: + in_code_block = False + elements.append({ + 'type': 'code_block', + 'language': language if 'language' in locals() else '', + 'content': '\n'.join(code_block_content), + 'level': 0 + }) + code_block_content = [] + i += 1 + continue + + if in_code_block: + code_block_content.append(line) + i += 1 + continue + + # 处理表格 + table_match = MarkdownParser.PATTERNS['table_row'].match(line) + table_sep_match = MarkdownParser.PATTERNS['table_separator'].match(line) + + if table_match or table_sep_match: + if not table_mode: + table_mode = True + table_rows = [] + + if table_match and not table_sep_match: + cells = [cell.strip() for cell in table_match.group(1).split('|')] + table_rows.append(cells) + + i += 1 + continue + elif table_mode: + # 表格结束 + if table_rows: + elements.append({ + 'type': 'table', + 'rows': table_rows, + 
'level': 0 + }) + table_mode = False + table_rows = [] + + # 处理标题 + heading_match = MarkdownParser.PATTERNS['heading'].match(line) + if heading_match: + level = len(heading_match.group(2)) + if level <= config.title_levels: + # 提取标题文本(可能包含粗体等格式) + heading_text = heading_match.group(3).strip() + # 先移除Markdown标记但保留文本内容 + cleaned_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', heading_text) + elements.append({ + 'type': 'heading', + 'level': level, + 'content': heading_text, # 保留原始内容用于格式处理 + 'cleaned_content': cleaned_text # 用于显示的纯文本 + }) + current_section = elements[-1] + current_section['paragraphs'] = [] + i += 1 + continue + + # 处理水平分隔线 + if MarkdownParser.PATTERNS['horizontal_rule'].match(line): + elements.append({ + 'type': 'horizontal_rule', + 'level': 0 + }) + i += 1 + continue + + # 处理列表 + ul_match = MarkdownParser.PATTERNS['unordered_list'].match(line) + ol_match = MarkdownParser.PATTERNS['ordered_list'].match(line) + + if ul_match: + elements.append({ + 'type': 'unordered_list', + 'content': ul_match.group(1), + 'level': 0 + }) + i += 1 + continue + + if ol_match: + elements.append({ + 'type': 'ordered_list', + 'content': ol_match.group(1), + 'level': 0 + }) + i += 1 + continue + + # 处理引用 + quote_match = MarkdownParser.PATTERNS['blockquote'].match(line) + if quote_match: + elements.append({ + 'type': 'blockquote', + 'content': quote_match.group(1), + 'level': 0 + }) + i += 1 + continue + + # 处理空行 + if line.strip() == '': + elements.append({ + 'type': 'empty', + 'content': '', + 'level': 0 + }) + i += 1 + continue + + # 处理普通段落 + elements.append({ + 'type': 'paragraph', + 'content': line, + 'level': 0 + }) + + i += 1 + + # 处理剩余的表格 + if table_mode and table_rows: + elements.append({ + 'type': 'table', + 'rows': table_rows, + 'level': 0 + }) + + return MarkdownParser.group_by_sections(elements) + + @staticmethod + def group_by_sections(elements): + """将解析的元素按标题分组""" + sections = [] + current_section = { + 'type': 'section', + 'level': 0, + 'content': '前置内容', 
+ 'elements': [] + } + + for element in elements: + if element['type'] == 'heading': + # 保存当前section + if current_section['elements']: + sections.append(current_section) + + # 创建新section + current_section = { + 'type': 'section', + 'level': element['level'], + 'content': element['content'], + 'elements': [] + } + else: + current_section['elements'].append(element) + + # 添加最后一个section + if current_section['elements']: + sections.append(current_section) + + return sections + + @staticmethod + def extract_inline_formatting(text): + """提取行内格式信息""" + formatting = [] + + # 提取粗体 (**) + for match in MarkdownParser.PATTERNS['bold_asterisk'].finditer(text): + formatting.append({ + 'type': 'bold', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取粗体 (__) + for match in MarkdownParser.PATTERNS['bold_underscore'].finditer(text): + formatting.append({ + 'type': 'bold', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取斜体 (*) + for match in MarkdownParser.PATTERNS['italic_asterisk'].finditer(text): + # 检查是否与粗体重叠 + overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end'] + for f in formatting if f['type'] == 'bold') + if not overlaps: + formatting.append({ + 'type': 'italic', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取斜体 (_) + for match in MarkdownParser.PATTERNS['italic_underscore'].finditer(text): + overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end'] + for f in formatting if f['type'] in ['bold', 'italic']) + if not overlaps: + formatting.append({ + 'type': 'italic', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取行内代码 + for match in MarkdownParser.PATTERNS['code_inline'].finditer(text): + formatting.append({ + 'type': 'code', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取删除线 + for match in 
MarkdownParser.PATTERNS['strikethrough'].finditer(text): + formatting.append({ + 'type': 'strikethrough', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取链接 + for match in MarkdownParser.PATTERNS['link'].finditer(text): + formatting.append({ + 'type': 'link', + 'start': match.start(), + 'end': match.end(), + 'text': match.group(1), + 'url': match.group(2) + }) + + # 按位置排序 + formatting.sort(key=lambda x: x['start']) + return formatting + + +# 文件处理模块 +class FileHandler: + @staticmethod + def scan_txt_files(folder_path): + """扫描文件夹中的所有TXT文件""" + if not os.path.isdir(folder_path): + raise Exception(f"TXT文件夹不存在: {folder_path}") + + txt_files = [] + for root, dirs, files in os.walk(folder_path): + for file in files: + if file.lower().endswith(".txt"): + txt_path = os.path.join(root, file) + file_name = os.path.splitext(file)[0] + txt_files.append({ + "path": txt_path, + "name": file_name, + "relative_path": os.path.relpath(txt_path, folder_path), + "folder": root + }) + + if not txt_files: + raise Exception(f"在 {folder_path} 中未找到任何TXT文件") + + return sorted(txt_files, key=lambda x: x["relative_path"]) + + @staticmethod + def find_matching_image_folders(txt_files, images_root): + """根据TXT文件名匹配图片文件夹""" + if not os.path.isdir(images_root): + raise Exception(f"图片根文件夹不存在: {images_root}") + + all_image_folders = [] + for root, dirs, _ in os.walk(images_root): + for dir in dirs: + folder_path = os.path.join(root, dir) + all_image_folders.append({ + "path": folder_path, + "name": dir, + "relative_path": os.path.relpath(folder_path, images_root) + }) + + matched_pairs = [] + for txt in txt_files: + matches = [] + txt_name = txt["name"].lower() + + for img_folder in all_image_folders: + folder_name = img_folder["name"].lower() + + if config.match_pattern == "exact" and txt_name == folder_name: + matches.append(img_folder) + elif config.match_pattern == "prefix" and folder_name.startswith(txt_name): + matches.append(img_folder) + elif 
config.match_pattern == "contains" and txt_name in folder_name: + matches.append(img_folder) + + if matches: + matches.sort(key=lambda x: len(x["relative_path"])) + matched_pairs.append({ + "txt": txt, + "image_folder": matches[0], + "all_matches": matches + }) + else: + matched_pairs.append({ + "txt": txt, + "image_folder": None, + "all_matches": [] + }) + + return matched_pairs + + @staticmethod + def get_image_files(folder_path): + """获取文件夹中的所有图片文件""" + if not folder_path or not os.path.isdir(folder_path): + return [] + + image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif', '*.webp', '*.tiff'] + image_files = [] + + for ext in image_extensions: + image_files.extend(glob.glob(os.path.join(folder_path, ext))) + + if config.image_sort_by == "name": + image_files.sort() + elif config.image_sort_by == "time": + image_files.sort(key=lambda x: os.path.getmtime(x)) + + return image_files + + @staticmethod + def read_markdown_txt(file_path): + """读取含Markdown内容的TXT文件""" + if not os.path.exists(file_path): + raise Exception(f"TXT文件不存在: {file_path}") + + encodings = [config.txt_encoding, "gbk", "utf-16", "iso-8859-1"] + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + content = f.read() + content = content.replace("\r\n", "\n").replace("\r", "\n") + return content + except UnicodeDecodeError: + continue + + raise Exception(f"无法解析TXT文件(编码问题): {file_path}") + + @staticmethod + def prepare_output_path(txt_info, images_root, output_root): + """准备输出文件路径""" + if config.output_location == "txt_folder": + base_folder = txt_info["folder"] + else: + base_folder = output_root + + os.makedirs(base_folder, exist_ok=True) + + txt_name = txt_info["name"] + output_path = os.path.join(base_folder, f"{txt_name}.docx") + + counter = 1 + while os.path.exists(output_path): + output_path = os.path.join(base_folder, f"{txt_name}_{counter}.docx") + counter += 1 + + return output_path + + +# 图片处理模块 +class ImageProcessor: + @staticmethod + def 
process_image(image_path): + """处理图片""" + try: + with Image.open(image_path) as img: + # 处理图片方向 + if hasattr(img, '_getexif'): + exif = img._getexif() + if exif: + orientation_tag = 274 + if orientation_tag in exif: + orientation = exif[orientation_tag] + if orientation == 3: + img = img.rotate(180, expand=True) + elif orientation == 6: + img = img.rotate(270, expand=True) + elif orientation == 8: + img = img.rotate(90, expand=True) + + # 调整大小 + if config.image_resize == "width" and config.image_width > 0: + target_width_px = config.image_width * 96 + width, height = img.size + + if width > target_width_px: + ratio = target_width_px / width + new_height = int(height * ratio) + img = img.resize((int(target_width_px), new_height), Image.LANCZOS) + + return img, config.image_width + else: + width_in = img.width / 96 + return img, width_in + except Exception as e: + raise Exception(f"处理图片失败 {image_path}: {str(e)}") + + @staticmethod + def get_image_alignment(): + """获取图片对齐方式""" + if config.image_alignment == "left": + return WD_ALIGN_PARAGRAPH.LEFT + elif config.image_alignment == "right": + return WD_ALIGN_PARAGRAPH.RIGHT + else: + return WD_ALIGN_PARAGRAPH.CENTER + + +DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`""" + + +# DOCX生成模块 - 完全重构 +class DocxGenerator: + @staticmethod + def generate(sections, image_files, output_path, progress_callback=None): + """生成DOCX文档 - 重构版本""" + doc = Document() + total_sections = len(sections) + image_index = 0 + image_count = len(image_files) + + for i, section in enumerate(sections): + if progress_callback: + progress = int((i / total_sections) * 100) + progress_callback(progress, f"处理章节: {section['content'][:30]}...") + + # 添加标题 + if section['level'] > 0 and section['level'] <= config.title_levels: + # 使用原始带格式的内容进行处理 + heading_text = TextProcessor.process_text_content(section['content']) + # 创建标题段落 + para = 
doc.add_heading(level=section['level']) + # 应用行内格式(包括粗体) + DocxGenerator.apply_inline_formatting(para, heading_text) + elif section['content'] != '前置内容': + heading_text = TextProcessor.process_text_content(section['content']) + para = doc.add_paragraph() + run = para.add_run(heading_text) + run.font.size = Pt(14) + run.font.bold = True + para.space_after = Pt(12) + + # 处理章节中的元素 + elements = section.get('elements', []) + if not elements: + continue + + # 处理第一个非空元素后插入图片 + first_content_added = False + + for element in elements: + # 添加元素到文档 + DocxGenerator.add_element_to_doc(doc, element) + + # 在第一个内容元素后插入图片 + if not first_content_added and element['type'] not in ['empty']: + first_content_added = True + + # 插入图片 + if image_count > 0 and image_index < image_count: + try: + DocxGenerator.insert_image(doc, image_files[image_index], output_path) + image_index += 1 + + if image_index >= image_count: + if config.image_strategy == "cycle": + image_index = 0 + elif config.image_strategy == "truncate": + image_index = image_count + + except Exception as e: + doc.add_paragraph(f"[图片插入失败: {str(e)}]") + + # 添加免责声明 + if config.add_disclaimer: + doc.add_paragraph("---") + para = doc.add_paragraph() + disclaimer_text = TextProcessor.process_text_content(DISCLAIMER_TEXT) + run = para.add_run(disclaimer_text) + run.font.size = Pt(10) + para.paragraph_format.line_spacing = 1.0 + + try: + doc.save(output_path) + if progress_callback: + progress_callback(100, "转换完成!") + return True + except Exception as e: + raise Exception(f"保存DOCX失败: {str(e)}") + + @staticmethod + def add_element_to_doc(doc, element): + """将解析的元素添加到文档中""" + etype = element['type'] + content = TextProcessor.process_text_content(element.get('content', '')) + + if etype == 'paragraph': + DocxGenerator.add_formatted_paragraph(doc, content) + + elif etype == 'unordered_list': + para = doc.add_paragraph(style='List Bullet') + DocxGenerator.apply_inline_formatting(para, content) + + elif etype == 'ordered_list': + para = 
doc.add_paragraph(style='List Number') + DocxGenerator.apply_inline_formatting(para, content) + + elif etype == 'blockquote': + para = doc.add_paragraph(style='Quote') + DocxGenerator.apply_inline_formatting(para, content) + + elif etype == 'code_block': + para = doc.add_paragraph(style='No Spacing') + run = para.add_run(element['content']) + run.font.name = 'Courier New' + run.font.size = Pt(10) + + elif etype == 'table': + DocxGenerator.add_table_to_doc(doc, element['rows']) + + elif etype == 'horizontal_rule': + DocxGenerator.add_horizontal_rule(doc) + + elif etype == 'empty': + doc.add_paragraph() + + + @staticmethod + def add_horizontal_rule(doc): + """在文档中添加横线""" + para = doc.add_paragraph() + run = para.add_run() + # 添加水平线条(使用下划线作为横线) + run.font.underline = True + run.text = " " * 100 # 足够长的下划线作为横线 + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + @staticmethod + def add_table_to_doc(doc, rows): + """添加表格到文档""" + if not rows: + return + + table = doc.add_table(rows=len(rows), cols=len(rows[0])) + table.style = 'Table Grid' + + for i, row_data in enumerate(rows): + row_cells = table.rows[i].cells + for j, cell_data in enumerate(row_data): + if j < len(row_cells): + # 处理单元格内容的格式和文字处理 + processed_text = TextProcessor.process_text_content(cell_data) + row_cells[j].text = processed_text + + @staticmethod + def insert_image(doc, image_path, output_path): + """插入图片到文档""" + img, width = ImageProcessor.process_image(image_path) + + temp_img_path = None + if config.image_resize == "width": + temp_dir = os.path.dirname(output_path) + os.makedirs(temp_dir, exist_ok=True) + temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png") + img.save(temp_img_path) + img_path = temp_img_path + else: + img_path = image_path + + para = doc.add_paragraph() + run = para.runs[0] if para.runs else para.add_run() + run.add_picture(img_path, width=Inches(width)) + para.alignment = ImageProcessor.get_image_alignment() + + if temp_img_path and os.path.exists(temp_img_path): 
+ try: + os.remove(temp_img_path) + except: + pass # 忽略删除临时文件的错误 + + @staticmethod + def add_formatted_paragraph(doc, content): + """添加带格式的段落""" + if not content or not content.strip(): + doc.add_paragraph() + return + + para = doc.add_paragraph() + DocxGenerator.apply_inline_formatting(para, content) + + if config.line_spacing > 0: + para.paragraph_format.line_spacing = config.line_spacing + + @staticmethod + def apply_inline_formatting(paragraph, text): + """应用行内格式到段落""" + # 首先处理文字内容(顺序调换和标点符号替换) + processed_text = TextProcessor.process_text_content(text) + + # 重新提取格式信息(因为文字可能已经改变) + formatting = MarkdownParser.extract_inline_formatting(processed_text) + + # 如果没有格式,直接添加文本 + if not formatting: + paragraph.add_run(processed_text) + return + + current_pos = 0 + + for fmt in formatting: + # 添加格式前的普通文本 + if fmt['start'] > current_pos: + paragraph.add_run(processed_text[current_pos:fmt['start']]) + + # 创建格式化的run + if fmt['type'] == 'bold': + # 移除markdown标记并应用格式 + clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', processed_text[fmt['start']:fmt['end']]) + run = paragraph.add_run(clean_text) + run.bold = True + + elif fmt['type'] == 'italic': + clean_text = re.sub(r'(? 
0: + sample_output = FileHandler.prepare_output_path(matched_pairs[0]['txt'], "", output_root) + main_output_folder = os.path.dirname(sample_output) + else: + main_output_folder = "" + + return { + "total": total, + "success": success_count, + "failed": len(failed_items), + "failed_items": failed_items, + "main_output_folder": main_output_folder + } + + +# 配置窗口 +def show_config_window(): + """显示配置窗口""" + layout = [ + [sg.Text('文件匹配设置', font=('bold', 12))], + [sg.Text('TXT编码:'), + sg.Combo(['utf-8', 'gbk', 'utf-16'], + default_value=config.txt_encoding, key='txt_encoding')], + [sg.Text('匹配模式:'), + sg.Radio('完全匹配(文件名与文件夹名相同)', 'match', + default=config.match_pattern == "exact", key='match_exact'), + sg.Radio('前缀匹配', 'match', + default=config.match_pattern == "prefix", key='match_prefix'), + sg.Radio('包含匹配', 'match', + default=config.match_pattern == "contains", key='match_contains')], + [sg.HSeparator()], + [sg.Checkbox('转换文字顺序', key='-REVERSE_TEXT-', default=config.reverse_text_order)], + [sg.HSeparator()], + [sg.Checkbox('替换标点符号(句号转逗号,保留结尾句号)', + key='-REPLACE_PUNCTUATION-', + default=config.replace_punctuation)], + [sg.HSeparator()], + [sg.Checkbox('添加免责声明', key='-ADD_DISCLAIMER-', default=config.add_disclaimer)], + [sg.HSeparator()], + [sg.Radio('输出到TXT文件所在文件夹', 'output_loc', + default=config.output_location == "txt_folder", key='output_txt_folder'), + sg.Radio('输出到指定文件夹', 'output_loc', + default=config.output_location == "custom", key='output_custom')], + [sg.HSeparator()], + [sg.Text('图片处理设置', font=('bold', 12))], + [sg.Text('图片排序方式:'), + sg.Radio('按名称', 'sort', default=config.image_sort_by == "name", key='sort_name'), + sg.Radio('按修改时间', 'sort', default=config.image_sort_by == "time", key='sort_time')], + [sg.Text('图片尺寸调整:'), + sg.Radio('不调整', 'resize', default=config.image_resize == "none", key='resize_none'), + sg.Radio('按宽度:', 'resize', default=config.image_resize == "width", key='resize_width'), + sg.InputText(str(config.image_width), size=(5, 1), 
key='image_width'), + sg.Text('英寸')], + [sg.Text('图片对齐方式:'), + sg.Radio('左对齐', 'align', default=config.image_alignment == "left", key='align_left'), + sg.Radio('居中', 'align', default=config.image_alignment == "center", key='align_center'), + sg.Radio('右对齐', 'align', default=config.image_alignment == "right", key='align_right')], + [sg.HSeparator()], + [sg.Text('图片不足时策略', font=('bold', 12))], + [sg.Radio('循环使用', 'strategy', default=config.image_strategy == "cycle", key='strategy_cycle'), + sg.Radio('忽略多余标题', 'strategy', default=config.image_strategy == "truncate", key='strategy_truncate'), + sg.Radio('重复最后一张', 'strategy', default=config.image_strategy == "repeat_last", key='strategy_repeat')], + [sg.HSeparator()], + [sg.Button('确定'), sg.Button('取消')] + ] + + window = sg.Window('转换设置', layout, modal=True, resizable=True) + + while True: + event, values = window.read() + if event in (sg.WIN_CLOSED, '取消'): + break + if event == '确定': + # 保存配置 + config.txt_encoding = values['txt_encoding'] + + if values['match_exact']: + config.match_pattern = "exact" + elif values['match_prefix']: + config.match_pattern = "prefix" + else: + config.match_pattern = "contains" + + config.output_location = "txt_folder" if values['output_txt_folder'] else "custom" + config.image_sort_by = "name" if values['sort_name'] else "time" + config.image_resize = "none" if values['resize_none'] else "width" + config.reverse_text_order = values['-REVERSE_TEXT-'] + config.replace_punctuation = values['-REPLACE_PUNCTUATION-'] + config.add_disclaimer = values['-ADD_DISCLAIMER-'] + + try: + config.image_width = float(values['image_width']) + except: + pass + + if values['align_left']: + config.image_alignment = "left" + elif values['align_right']: + config.image_alignment = "right" + else: + config.image_alignment = "center" + + if values['strategy_cycle']: + config.image_strategy = "cycle" + elif values['strategy_truncate']: + config.image_strategy = "truncate" + else: + config.image_strategy = 
"repeat_last" + + config.save_to_file(CONFIG_FILE_PATH) + break + + window.close() + + +# 匹配编辑窗口 +def show_matching_editor(matched_pairs, images_root): + """显示匹配编辑窗口,允许手动调整匹配关系""" + all_image_folders = [] + if os.path.isdir(images_root): + for root, dirs, _ in os.walk(images_root): + for dir in dirs: + folder_path = os.path.join(root, dir) + rel_path = os.path.relpath(folder_path, images_root) + all_image_folders.append((folder_path, rel_path)) + + table_data = [] + for i, pair in enumerate(matched_pairs): + txt_name = pair['txt']['name'] + img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配" + table_data.append([i, txt_name, img_folder]) + + layout = [ + [sg.Text('文件匹配编辑', font=('bold', 14))], + [sg.Text('选择要修改的项目,然后从右侧选择图片文件夹')], + [ + sg.Table( + values=table_data, + headings=['序号', 'TXT文件名', '匹配的图片文件夹'], + key='-TABLE-', + select_mode=sg.TABLE_SELECT_MODE_BROWSE, + enable_events=True, + justification='left', + size=(None, 15) + ), + sg.VSeparator(), + sg.Listbox( + values=[f[1] for f in all_image_folders], + key='-FOLDERS-', + size=(40, 15), + enable_events=True + ) + ], + [sg.Button('设置选中项'), sg.Button('清除选中项'), sg.Button('应用所有')] + ] + + window = sg.Window('匹配编辑', layout, resizable=True) + selected_row = None + + while True: + event, values = window.read() + + if event in (sg.WIN_CLOSED, '应用所有'): + break + + if event == '-TABLE-': + if values['-TABLE-']: + selected_row = values['-TABLE-'][0] + + if event == '设置选中项' and selected_row is not None and values['-FOLDERS-']: + folder_idx = [i for i, f in enumerate(all_image_folders) if f[1] == values['-FOLDERS-'][0]][0] + folder_path, folder_rel = all_image_folders[folder_idx] + + matched_pairs[selected_row]['image_folder'] = { + "path": folder_path, + "name": os.path.basename(folder_path), + "relative_path": folder_rel + } + + table_data[selected_row][2] = folder_rel + window['-TABLE-'].update(values=table_data) + + if event == '清除选中项' and selected_row is not None: + 
# Help window
def show_help_window():
    """Show the usage guide in a scrollable popup.

    The guide text is user-facing and is handed to ``sg.popup_scrolled``
    exactly as written below.
    """
    help_text = """
批量Markdown TXT转DOCX工具使用说明:

1. 选择包含Markdown内容的TXT文件所在文件夹
2. 选择图片文件夹的根目录(程序会自动查找子文件夹)
3. 选择输出文件的保存根目录(当选择"输出到指定文件夹"时有效)
4. 点击"扫描文件"按钮,程序会自动匹配TXT文件和图片文件夹
5. 查看匹配结果,可点击"编辑匹配"调整匹配关系
6. 点击"开始批量转换"生成DOCX文件

支持的Markdown格式:
- 标题:# ## ### #### ##### ######
- 粗体:**文字** 或 __文字__
- 斜体:*文字* 或 _文字_
- 行内代码:`代码`
- 代码块:```语言\\n代码\\n```
- 删除线:~~文字~~
- 链接:[链接文字](URL)
- 图片:![图片描述](图片路径)
- 无序列表:- 或 * 或 +
- 有序列表:1. 2. 3.
- 引用:> 引用内容
- 表格:| 列1 | 列2 |
- 水平分隔线:--- 或 *** 或 ___

输出路径选择:
- 输出到TXT文件所在文件夹: 每个DOCX文件会直接保存在对应TXT文件所在的文件夹中
- 输出到指定文件夹: 所有DOCX文件会直接保存在您指定的文件夹中

匹配规则:
- 完全匹配: TXT文件名(不含扩展名)与图片文件夹名完全相同
- 前缀匹配: 图片文件夹名以前缀形式包含TXT文件名
- 包含匹配: 图片文件夹名中包含TXT文件名

转换规则:
- 每个小标题的第一段后会插入一张图片
- 先将Markdown格式转换为DOCX格式,再处理文字内容
- 支持文字顺序调换和标点符号替换功能
"""
    sg.popup_scrolled('使用帮助', help_text, size=(70, 25))


def _open_folder(path):
    """Open *path* in the platform's file manager.

    On Windows this uses ``os.startfile``. Elsewhere it runs ``open``
    (macOS) or ``xdg-open`` with the path as its own argv element, so no
    shell parses the path and quotes, ``$`` or backticks in a folder name
    cannot break or alter the command.
    """
    if sys.platform.startswith('win'):
        os.startfile(path)
        return

    # Function-scope import keeps this block self-contained.
    import subprocess

    opener = 'open' if sys.platform.startswith('darwin') else 'xdg-open'
    try:
        subprocess.run([opener, path], check=False)
    except OSError:
        # The opener program is missing or cannot be executed. Opening the
        # folder is only a convenience offered after the conversion has
        # already finished, so a failure here is deliberately not escalated.
        pass


# Results window
def show_results_window(results):
    """Show the outcome of a batch run and offer to open the output folder.

    Args:
        results: Mapping read with the keys ``'total'``, ``'success'``,
            ``'failed'``, ``'failed_items'`` (an iterable of mappings with
            ``'name'`` and ``'error'``) and ``'main_output_folder'``.
    """
    output_folder = results['main_output_folder']

    if results['failed'] == 0:
        message = f"全部成功!\n共处理 {results['total']} 个文件,全部转换成功。"
        if output_folder:
            message += f"\n主要输出文件夹: {output_folder}"
        sg.popup('处理完成', message)
    else:
        failed_text = "\n".join(
            f"- {item['name']}: {item['error']}" for item in results['failed_items']
        )
        message = (f"处理完成!\n共处理 {results['total']} 个文件,"
                   f"{results['success']} 个成功,{results['failed']} 个失败。\n\n"
                   f"失败项:\n{failed_text}")
        if output_folder:
            message += f"\n主要输出文件夹: {output_folder}"
        # The failure list can be long, so this branch uses a scrollable popup.
        sg.popup_scrolled('处理完成', message, size=(60, 20))

    # Ask whether to open the output folder, but only if it actually exists.
    if output_folder and os.path.exists(output_folder):
        if sg.popup_yes_no('是否打开主要输出文件夹?') == 'Yes':
            _open_folder(output_folder)


def _preview_rows(matched_pairs):
    """Build the preview-table rows (name, relative path, image folder)."""
    return [
        [
            pair['txt']['name'],
            pair['txt']['relative_path'],
            pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配",
        ]
        for pair in matched_pairs
    ]


# Main window
def main_window():
    """Build the main window and run its event loop until the user exits.

    The loop handles: exit (stores the three folder fields in ``config`` and
    saves it), the settings and help dialogs, scanning and matching TXT
    files against image folders, editing the matches, running the batch
    conversion, and pre-filling the output folder when it is still empty.
    """
    sg.theme('BlueMono')
    matched_pairs = []

    layout = [
        [sg.Text('批量Markdown TXT转DOCX工具', font=('bold', 16))],
        [sg.Text('(按文件名匹配TXT文件和图片文件夹,支持完整Markdown格式)', text_color='gray')],
        [sg.HSeparator()],
        [sg.Text('TXT文件文件夹:', size=(15, 1)),
         sg.InputText(key='txt_folder', enable_events=True, default_text=config.last_txt_folder),
         sg.FolderBrowse('浏览')],
        [sg.Text('图片根文件夹:', size=(15, 1)),
         sg.InputText(key='images_root', enable_events=True, default_text=config.last_images_root),
         sg.FolderBrowse('浏览')],
        [sg.Text('输出根文件夹:', size=(15, 1)),
         sg.InputText(key='output_root', enable_events=True, default_text=config.last_output_root),
         sg.FolderBrowse('浏览'),
         sg.Text('(当选择"输出到指定文件夹"时有效)', text_color='gray')],
        [sg.Button('扫描文件', size=(12, 1)),
         sg.Button('编辑匹配', size=(12, 1), disabled=True),
         sg.Button('转换设置', size=(12, 1)),
         sg.Button('帮助', size=(8, 1))],
        [sg.HSeparator()],
        [sg.Text('匹配结果预览:', font=('bold', 10))],
        [sg.Table(
            values=[],
            headings=['TXT文件名', '相对路径', '匹配的图片文件夹'],
            key='-PREVIEW_TABLE-',
            auto_size_columns=False,
            col_widths=[20, 30, 30],
            justification='left',
            size=(None, 10)
        )],
        [sg.ProgressBar(100, orientation='h', size=(80, 20), key='progress_bar', visible=False)],
        [sg.Text('状态: 就绪', key='status_text', size=(80, 1))],
        [sg.Button('开始批量转换', size=(15, 1), disabled=True), sg.Button('退出')]
    ]

    window = sg.Window('批量Markdown TXT转DOCX工具', layout, resizable=True)
    progress_bar = window['progress_bar']
    status_text = window['status_text']
    preview_table = window['-PREVIEW_TABLE-']
    output_root_input = window['output_root']

    def update_output_root_state():
        """Enable the output-folder input only for the "custom" output location."""
        if config.output_location == "custom":
            output_root_input.update(disabled=False)
            output_root_input.Widget.configure(foreground='black')
        else:
            output_root_input.update(disabled=True)
            output_root_input.Widget.configure(foreground='gray')

    # One short read before the element's underlying widget is configured.
    window.read(timeout=1)
    update_output_root_state()

    while True:
        event, values = window.read()

        if event in (sg.WIN_CLOSED, '退出'):
            # values may be None when the window was closed, hence the guard.
            if values is not None:
                config.last_txt_folder = values.get('txt_folder', '')
                config.last_images_root = values.get('images_root', '')
                config.last_output_root = values.get('output_root', '')
                config.save_to_file(CONFIG_FILE_PATH)
            break

        if event == '转换设置':
            # Keep what the user typed: write it back after the settings
            # dialog has run and the input state has been refreshed.
            current_output_root = values['output_root']
            show_config_window()
            update_output_root_state()
            window['output_root'].update(current_output_root)

        elif event == '帮助':
            show_help_window()

        elif event == '扫描文件':
            txt_folder = values['txt_folder']
            images_root = values['images_root']

            if not txt_folder:
                sg.popup_error('请选择TXT文件所在的文件夹')
                continue

            if not images_root:
                sg.popup_error('请选择图片根文件夹')
                continue

            # Store the folders in config and save it before scanning.
            config.last_txt_folder = txt_folder
            config.last_images_root = images_root
            if values['output_root']:
                config.last_output_root = values['output_root']
            config.save_to_file(CONFIG_FILE_PATH)

            try:
                status_text.update('正在扫描TXT文件...')
                window.refresh()
                txt_files = FileHandler.scan_txt_files(txt_folder)

                status_text.update('正在匹配图片文件夹...')
                window.refresh()
                matched_pairs = FileHandler.find_matching_image_folders(txt_files, images_root)

                preview_table.update(values=_preview_rows(matched_pairs))
                status_text.update(f'扫描完成: 找到 {len(matched_pairs)} 个TXT文件')

                window['编辑匹配'].update(disabled=False)
                window['开始批量转换'].update(disabled=False)

            except Exception as e:
                # Top-level UI boundary: report the failure and keep the window alive.
                sg.popup_error(f'扫描失败: {str(e)}')
                status_text.update('状态: 扫描失败')

        elif event == '编辑匹配' and matched_pairs:
            images_root = values['images_root']
            if not images_root:
                sg.popup_error('请选择图片根文件夹')
                continue

            matched_pairs = show_matching_editor(matched_pairs, images_root)
            preview_table.update(values=_preview_rows(matched_pairs))

        elif event == '开始批量转换' and matched_pairs:
            if config.output_location == "custom" and not values['output_root']:
                sg.popup_error('请选择输出根文件夹(在"转换设置"中选择了"输出到指定文件夹")')
                continue

            try:
                progress_bar.update(0, visible=True)
                status_text.update('开始批量转换...')
                window.refresh()

                def update_batch_progress(progress, text):
                    """Show batch progress in the progress bar and status line."""
                    progress_bar.update(progress)
                    status_text.update(f'状态: {text}')
                    window.refresh()

                results = BatchProcessor.process_batch(matched_pairs, values['output_root'], update_batch_progress)
                show_results_window(results)
                status_text.update('状态: 批量转换完成')

            except Exception as e:
                # Top-level UI boundary: report the failure and keep the window alive.
                sg.popup_error(f'批量处理失败: {str(e)}')
                status_text.update('状态: 批量转换失败')

        elif event in ('txt_folder', 'images_root') and values[event] and not values['output_root']:
            # Pre-fill the still-empty output folder, preferring the TXT folder.
            default_output = values['txt_folder'] if values['txt_folder'] else values['images_root']
            window['output_root'].update(default_output)

    window.close()


# Program entry point
if __name__ == '__main__':
    main_window()