From 3d2e13c54069642b9d1c0e33bb9827f49da020bb Mon Sep 17 00:00:00 2001 From: taiyi Date: Mon, 11 Aug 2025 18:58:33 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20ArticleReplaceBatch/txt2md?= =?UTF-8?q?2docx.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更新格式转换 --- ArticleReplaceBatch/txt2md2docx.py | 851 ++++++++++++++++++----------- 1 file changed, 521 insertions(+), 330 deletions(-) diff --git a/ArticleReplaceBatch/txt2md2docx.py b/ArticleReplaceBatch/txt2md2docx.py index 1f4b716..48811b4 100644 --- a/ArticleReplaceBatch/txt2md2docx.py +++ b/ArticleReplaceBatch/txt2md2docx.py @@ -1,16 +1,19 @@ import os import sys import glob +import re from PIL import Image from docx import Document -from docx.shared import Inches, Pt +from docx.shared import Inches, Pt, RGBColor from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.enum.style import WD_STYLE_TYPE import PySimpleGUI as sg from replacestr import replace_text -import configparser # 新增:导入配置文件处理模块 +import configparser CONFIG_FILE_PATH = os.path.join(os.path.expanduser("~"), ".txt2md2docx.ini") + # 配置设置 class Config: def __init__(self): @@ -18,7 +21,7 @@ class Config: self.txt_encoding = "utf-8" self.match_pattern = "exact" # exact: 完全匹配, prefix: 前缀匹配, contains: 包含 self.output_location = "txt_folder" # txt_folder or custom - # 最近使用的文件夹路径 - 新增 + # 最近使用的文件夹路径 self.last_txt_folder = "" self.last_images_root = "" self.last_output_root = "" @@ -36,7 +39,6 @@ class Config: self.replace_punctuation = False # 是否替换标点符号 self.add_disclaimer = False # 是否添加免责声明 - # 新增:从配置文件加载配置 def load_from_file(self, file_path): if not os.path.exists(file_path): return False @@ -83,7 +85,6 @@ class Config: return True - # 新增:保存配置到文件 def save_to_file(self, file_path): config_parser = configparser.ConfigParser() @@ -127,19 +128,15 @@ class Config: # 全局配置实例 config = Config() - -# 新增:尝试加载配置文件 config.load_from_file(CONFIG_FILE_PATH) -# 添加文字处理工具类(可放在FileHandler类之后) +# 文字处理工具类 - 增强功能 class TextProcessor: - @staticmethod def replace_periods(text): - # 去除文本首尾的空白字符 + """替换标点符号:句号转逗号,保留结尾句号""" text = text.strip() - if not text: return "" @@ -169,7 +166,6 @@ class TextProcessor: """反转文本顺序(按字符级反转)""" if not content: return content - # 按字符反转整个文本 return content[::-1] @staticmethod @@ -178,10 +174,316 @@ class TextProcessor: if not content: return content paragraphs = content.split('\n') - # 反转段落列表并重新拼接 return '\n'.join(reversed(paragraphs)) -# 文件处理模块 - 增强文件夹和匹配处理 + @staticmethod + def process_text_content(text): + """统一处理文字内容:顺序调换和标点符号替换""" + if not text or not text.strip(): + return text + + # 先进行文字顺序处理 + if config.reverse_text_order: + text = replace_text(text) + + # 再进行标点符号替换 + if config.replace_punctuation: + text = TextProcessor.replace_periods(text) + + return text + + +# 增强的Markdown解析器 +class MarkdownParser: + # Markdown格式匹配模式 + PATTERNS = { + 'heading': re.compile(r'^(\s*)(#{1,6})\s+(.+)$'), + 'bold_asterisk': re.compile(r'\*\*(.+?)\*\*'), + 'bold_underscore': re.compile(r'__(.+?)__'), + 'italic_asterisk': re.compile(r'(?\s*(.+)$'), + 'horizontal_rule': re.compile(r'^(\s*[-*_]){3,}\s*$'), + 'table_row': re.compile(r'^\|(.+)\|$'), + 'table_separator': re.compile(r'^\|(\s*:?-+:?\s*\|)+$') + } + + @staticmethod + def parse(txt_content): + """解析Markdown内容为结构化数据""" + elements = [] + lines = txt_content.split('\n') + i = 0 + current_section = None + in_code_block = False + code_block_content = [] + table_mode = False + table_rows = [] + + while i < len(lines): + line = lines[i].rstrip('\r') + original_line = line + + # 处理代码块 + if line.strip().startswith('```'): + if not in_code_block: + in_code_block = True + language = line.strip()[3:].strip() + code_block_content = [] + i += 1 + continue + else: + in_code_block = False + elements.append({ + 'type': 'code_block', + 'language': language if 'language' in locals() else '', + 'content': '\n'.join(code_block_content), + 'level': 0 + }) + code_block_content = [] + i += 1 + continue + + if in_code_block: + code_block_content.append(line) + i += 1 + continue + + # 处理表格 + table_match = MarkdownParser.PATTERNS['table_row'].match(line) + table_sep_match = MarkdownParser.PATTERNS['table_separator'].match(line) + + if table_match or table_sep_match: + if not table_mode: + table_mode = True + table_rows = [] + + if table_match and not table_sep_match: + cells = [cell.strip() for cell in table_match.group(1).split('|')] + table_rows.append(cells) + + i += 1 + continue + elif table_mode: + # 表格结束 + if table_rows: + elements.append({ + 'type': 'table', + 'rows': table_rows, + 'level': 0 + }) + table_mode = False + table_rows = [] + + # 处理标题 + heading_match = MarkdownParser.PATTERNS['heading'].match(line) + if heading_match: + level = len(heading_match.group(2)) + if level <= config.title_levels: + heading_text = heading_match.group(3).strip() + elements.append({ + 'type': 'heading', + 'level': level, + 'content': heading_text + }) + current_section = elements[-1] + current_section['paragraphs'] = [] + i += 1 + continue + + # 处理水平分隔线 + if MarkdownParser.PATTERNS['horizontal_rule'].match(line): + elements.append({ + 'type': 'horizontal_rule', + 'level': 0 + }) + i += 1 + continue + + # 处理列表 + ul_match = MarkdownParser.PATTERNS['unordered_list'].match(line) + ol_match = MarkdownParser.PATTERNS['ordered_list'].match(line) + + if ul_match: + elements.append({ + 'type': 'unordered_list', + 'content': ul_match.group(1), + 'level': 0 + }) + i += 1 + continue + + if ol_match: + elements.append({ + 'type': 'ordered_list', + 'content': ol_match.group(1), + 'level': 0 + }) + i += 1 + continue + + # 处理引用 + quote_match = MarkdownParser.PATTERNS['blockquote'].match(line) + if quote_match: + elements.append({ + 'type': 'blockquote', + 'content': quote_match.group(1), + 'level': 0 + }) + i += 1 + continue + + # 处理空行 + if line.strip() == '': + elements.append({ + 'type': 'empty', + 'content': '', + 'level': 0 + }) + i += 1 + continue + + # 处理普通段落 + elements.append({ + 'type': 'paragraph', + 'content': line, + 'level': 0 + }) + + i += 1 + + # 处理剩余的表格 + if table_mode and table_rows: + elements.append({ + 'type': 'table', + 'rows': table_rows, + 'level': 0 + }) + + return MarkdownParser.group_by_sections(elements) + + @staticmethod + def group_by_sections(elements): + """将解析的元素按标题分组""" + sections = [] + current_section = { + 'type': 'section', + 'level': 0, + 'content': '前置内容', + 'elements': [] + } + + for element in elements: + if element['type'] == 'heading': + # 保存当前section + if current_section['elements']: + sections.append(current_section) + + # 创建新section + current_section = { + 'type': 'section', + 'level': element['level'], + 'content': element['content'], + 'elements': [] + } + else: + current_section['elements'].append(element) + + # 添加最后一个section + if current_section['elements']: + sections.append(current_section) + + return sections + + @staticmethod + def extract_inline_formatting(text): + """提取行内格式信息""" + formatting = [] + + # 提取粗体 (**) + for match in MarkdownParser.PATTERNS['bold_asterisk'].finditer(text): + formatting.append({ + 'type': 'bold', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取粗体 (__) + for match in MarkdownParser.PATTERNS['bold_underscore'].finditer(text): + formatting.append({ + 'type': 'bold', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取斜体 (*) + for match in MarkdownParser.PATTERNS['italic_asterisk'].finditer(text): + # 检查是否与粗体重叠 + overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end'] + for f in formatting if f['type'] == 'bold') + if not overlaps: + formatting.append({ + 'type': 'italic', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取斜体 (_) + for match in MarkdownParser.PATTERNS['italic_underscore'].finditer(text): + overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end'] + for f in formatting if f['type'] in ['bold', 'italic']) + if not overlaps: + formatting.append({ + 'type': 'italic', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取行内代码 + for match in MarkdownParser.PATTERNS['code_inline'].finditer(text): + formatting.append({ + 'type': 'code', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取删除线 + for match in MarkdownParser.PATTERNS['strikethrough'].finditer(text): + formatting.append({ + 'type': 'strikethrough', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取链接 + for match in MarkdownParser.PATTERNS['link'].finditer(text): + formatting.append({ + 'type': 'link', + 'start': match.start(), + 'end': match.end(), + 'text': match.group(1), + 'url': match.group(2) + }) + + # 按位置排序 + formatting.sort(key=lambda x: x['start']) + return formatting + + +# 文件处理模块 class FileHandler: @staticmethod def scan_txt_files(folder_path): @@ -190,18 +492,16 @@ class FileHandler: raise Exception(f"TXT文件夹不存在: {folder_path}") txt_files = [] - # 递归扫描所有TXT文件 for root, dirs, files in os.walk(folder_path): for file in files: if file.lower().endswith(".txt"): txt_path = os.path.join(root, file) - # 获取文件名(不含扩展名) file_name = os.path.splitext(file)[0] txt_files.append({ "path": txt_path, "name": file_name, "relative_path": os.path.relpath(txt_path, folder_path), - "folder": root # 存储文件所在的文件夹 + "folder": root }) if not txt_files: @@ -215,7 +515,6 @@ class FileHandler: if not os.path.isdir(images_root): raise Exception(f"图片根文件夹不存在: {images_root}") - # 获取所有图片文件夹 all_image_folders = [] for root, dirs, _ in os.walk(images_root): for dir in dirs: @@ -226,7 +525,6 @@ class FileHandler: "relative_path": os.path.relpath(folder_path, images_root) }) - # 为每个TXT文件匹配图片文件夹 matched_pairs = [] for txt in txt_files: matches = [] @@ -235,7 +533,6 @@ class FileHandler: for img_folder in all_image_folders: folder_name = img_folder["name"].lower() - # 根据匹配模式查找匹配项 if config.match_pattern == "exact" and txt_name == folder_name: matches.append(img_folder) elif config.match_pattern == "prefix" and folder_name.startswith(txt_name): @@ -243,7 +540,6 @@ class FileHandler: elif config.match_pattern == "contains" and txt_name in folder_name: matches.append(img_folder) - # 优先选择相对路径最短的匹配项 if matches: matches.sort(key=lambda x: len(x["relative_path"])) matched_pairs.append({ @@ -266,14 +562,12 @@ class FileHandler: if not folder_path or not os.path.isdir(folder_path): return [] - # 仅保留常见图片格式 - image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif'] + image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif', '*.webp', '*.tiff'] image_files = [] for ext in image_extensions: image_files.extend(glob.glob(os.path.join(folder_path, ext))) - # 根据配置排序 if config.image_sort_by == "name": image_files.sort() elif config.image_sort_by == "time": @@ -287,20 +581,12 @@ class FileHandler: if not os.path.exists(file_path): raise Exception(f"TXT文件不存在: {file_path}") - # 尝试多种编码读取TXT文件 encodings = [config.txt_encoding, "gbk", "utf-16", "iso-8859-1"] for encoding in encodings: try: with open(file_path, 'r', encoding=encoding) as f: content = f.read() content = content.replace("\r\n", "\n").replace("\r", "\n") - - # 新增:根据配置决定是否转换文字顺序 - if config.reverse_text_order: - # 这里使用段落反转,如需字符反转可改为TextProcessor.reverse_text_order - content = replace_text(content) - # content = TextProcessor.reverse_paragraph_order(content) - return content except UnicodeDecodeError: continue @@ -309,23 +595,17 @@ class FileHandler: @staticmethod def prepare_output_path(txt_info, images_root, output_root): - """准备输出文件路径(不使用子文件夹)""" - # 根据配置决定输出位置 + """准备输出文件路径""" if config.output_location == "txt_folder": - # 输出到TXT文件所在的文件夹 base_folder = txt_info["folder"] else: - # 输出到指定的根文件夹 base_folder = output_root - # 确保输出文件夹存在 os.makedirs(base_folder, exist_ok=True) - # 生成输出文件名 txt_name = txt_info["name"] output_path = os.path.join(base_folder, f"{txt_name}.docx") - # 处理文件名冲突 counter = 1 while os.path.exists(output_path): output_path = os.path.join(base_folder, f"{txt_name}_{counter}.docx") @@ -334,142 +614,6 @@ class FileHandler: return output_path -# Markdown解析模块 -class MarkdownParser: - @staticmethod - def parse(txt_content): - """解析TXT中的Markdown内容""" - headings = [] - current_heading = None - current_paragraphs = [] - in_code_block = False - - lines = txt_content.split('\n') - for line in lines: - line = line.rstrip('\r') - - # 处理代码块 - if line.startswith('```'): - in_code_block = not in_code_block - continue - - if in_code_block: - continue - - # 识别标题 - if line.lstrip().startswith('#'): - level = 0 - stripped_line = line.lstrip() - while level < len(stripped_line) and stripped_line[level] == '#' and level < config.title_levels: - level += 1 - - if level > 0 and (len(stripped_line) <= level or stripped_line[level] in (' ', '\t')): - if current_heading: - current_heading['paragraphs'] = current_paragraphs - headings.append(current_heading) - - heading_text = stripped_line[level:].lstrip() - current_heading = { - 'level': level, - 'content': heading_text, - 'paragraphs': [] - } - current_paragraphs = [] - continue - - # 处理无标题内容 - if current_heading is None: - current_heading = { - 'level': 0, - 'content': '前置内容', - 'paragraphs': [] - } - - # 处理段落 - if line.strip() == '': - if current_paragraphs and current_paragraphs[-1]['content'].strip() != '': - current_paragraphs.append({ - 'type': 'empty', - 'content': '', - 'is_first': False, - 'formatting': {} - }) - else: - para_type = 'text' - if line.startswith(('- ', '* ')): - para_type = 'unordered_list' - elif line.lstrip()[0].isdigit() and line.lstrip()[1:3] in ('. ', ') '): - para_type = 'ordered_list' - elif line.startswith('> '): - para_type = 'quote' - - is_first = len(current_paragraphs) == 0 and not any(p['type'] == 'text' for p in current_paragraphs) - formatting = MarkdownParser.extract_formatting(line) - - current_paragraphs.append({ - 'type': para_type, - 'content': line, - 'is_first': is_first, - 'formatting': formatting - }) - - if current_heading: - current_heading['paragraphs'] = current_paragraphs - headings.append(current_heading) - - return headings - - @staticmethod - def extract_formatting(text): - """提取文本格式""" - formatting = { - 'bold': [], - 'italic': [], - 'code': [] - } - - # 提取粗体 - start = 0 - while start < len(text): - pos = text.find('**', start) - if pos == -1: - break - end = text.find('**', pos + 2) - if end == -1: - break - formatting['bold'].append((pos, end + 2)) - start = end + 2 - - # 提取斜体 - start = 0 - while start < len(text): - pos = text.find('*', start) - if pos == -1: - break - if pos > 0 and text[pos - 1] == '*': - start = pos + 1 - continue - end = text.find('*', pos + 1) - if end == -1: - break - formatting['italic'].append((pos, end + 1)) - start = end + 1 - - # 提取代码 - start = 0 - while start < len(text): - pos = text.find('`', start) - if pos == -1: - break - end = text.find('`', pos + 1) - if end == -1: - break - formatting['code'].append((pos, end + 1)) - start = end + 1 - - return formatting - - # 图片处理模块 class ImageProcessor: @staticmethod @@ -518,87 +662,76 @@ class ImageProcessor: else: return WD_ALIGN_PARAGRAPH.CENTER + DISCLAIMER_TEXT = """[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。""" -# DOCX生成模块 + +# DOCX生成模块 - 完全重构 class DocxGenerator: @staticmethod - def generate(headings, image_files, output_path, progress_callback=None): - """生成DOCX文档""" + def generate(sections, image_files, output_path, progress_callback=None): + """生成DOCX文档 - 重构版本""" doc = Document() - total_headings = len(headings) + total_sections = len(sections) image_index = 0 image_count = len(image_files) - for i, heading in enumerate(headings): + for i, section in enumerate(sections): if progress_callback: - progress = int((i / total_headings) * 100) - progress_callback(progress, f"处理标题: {heading['content'][:30]}...") + progress = int((i / total_sections) * 100) + progress_callback(progress, f"处理章节: {section['content'][:30]}...") # 添加标题 - if heading['level'] > 0 and heading['level'] <= config.title_levels: - doc.add_heading(heading['content'], level=heading['level']) - else: - para = doc.add_paragraph(heading['content']) - run = para.runs[0] + if section['level'] > 0 and section['level'] <= config.title_levels: + heading_text = TextProcessor.process_text_content(section['content']) + doc.add_heading(heading_text, level=section['level']) + elif section['content'] != '前置内容': + heading_text = TextProcessor.process_text_content(section['content']) + para = doc.add_paragraph() + run = para.add_run(heading_text) run.font.size = Pt(14) run.font.bold = True para.space_after = Pt(12) - # 处理段落 - paragraphs = heading['paragraphs'] - if not paragraphs: + # 处理章节中的元素 + elements = section.get('elements', []) + if not elements: continue - # 处理第一段 - first_para = paragraphs[0] - DocxGenerator.add_formatted_paragraph(doc, first_para) + # 处理第一个非空元素后插入图片 + first_content_added = False - # 插入图片 - if image_count > 0 and image_index < image_count: - try: - img, width = ImageProcessor.process_image(image_files[image_index]) + for element in elements: + # 添加元素到文档 + DocxGenerator.add_element_to_doc(doc, element) - temp_img_path = None - if config.image_resize == "width": - temp_dir = os.path.dirname(output_path) - os.makedirs(temp_dir, exist_ok=True) - temp_img_path = os.path.join(temp_dir, f"temp_img_{image_index}.png") - img.save(temp_img_path) - img_path = temp_img_path - else: - img_path = image_files[image_index] + # 在第一个内容元素后插入图片 + if not first_content_added and element['type'] not in ['empty']: + first_content_added = True - para = doc.add_picture(img_path, width=Inches(width)) - para.alignment = ImageProcessor.get_image_alignment() + # 插入图片 + if image_count > 0 and image_index < image_count: + try: + DocxGenerator.insert_image(doc, image_files[image_index], output_path) + image_index += 1 - if temp_img_path and os.path.exists(temp_img_path): - os.remove(temp_img_path) + if image_index >= image_count: + if config.image_strategy == "cycle": + image_index = 0 + elif config.image_strategy == "truncate": + image_index = image_count - image_index += 1 + except Exception as e: + doc.add_paragraph(f"[图片插入失败: {str(e)}]") - if image_index >= image_count: - if config.image_strategy == "cycle": - image_index = 0 - elif config.image_strategy == "truncate": - image_index = image_count - - except Exception as e: - doc.add_paragraph(f"[图片插入失败: {str(e)}]") - - # 添加剩余段落 - for para in paragraphs[1:]: - DocxGenerator.add_formatted_paragraph(doc, para) - - # 新增:在文档末尾添加免责声明 - if config.add_disclaimer: - # 添加分隔线 - doc.add_paragraph("---") - # 添加免责声明段落 - para = doc.add_paragraph() - run = para.add_run(DISCLAIMER_TEXT) - run.font.size = Pt(10) # 可设置较小字体 - para.paragraph_format.line_spacing = 1.0 # 紧凑行距 + # 添加免责声明 + if config.add_disclaimer: + doc.add_paragraph("---") + para = doc.add_paragraph() + disclaimer_text = TextProcessor.process_text_content(DISCLAIMER_TEXT) + run = para.add_run(disclaimer_text) + run.font.size = Pt(10) + para.paragraph_format.line_spacing = 1.0 try: doc.save(output_path) @@ -609,77 +742,156 @@ class DocxGenerator: raise Exception(f"保存DOCX失败: {str(e)}") @staticmethod - def add_formatted_paragraph(doc, paragraph_data): - """添加带格式的段落""" - content = paragraph_data['content'] - para_type = paragraph_data['type'] - formatting = paragraph_data['formatting'] + def add_element_to_doc(doc, element): + """将解析的元素添加到文档中""" + element_type = element['type'] - # 新增:处理标点符号替换 - if config.replace_punctuation: - content = TextProcessor.replace_periods(content) + if element_type == 'paragraph': + DocxGenerator.add_formatted_paragraph(doc, element['content']) - if para_type == 'unordered_list': + elif element_type == 'unordered_list': para = doc.add_paragraph(style='List Bullet') - text = content[2:].strip() - elif para_type == 'ordered_list': + DocxGenerator.apply_inline_formatting(para, element['content']) + + elif element_type == 'ordered_list': para = doc.add_paragraph(style='List Number') - if '.' in content[:5]: - text = content.split('.', 1)[1].strip() - elif ')' in content[:5]: - text = content.split(')', 1)[1].strip() - else: - text = content.strip() - elif para_type == 'quote': - para = doc.add_paragraph(style='Intense Quote') - text = content[2:].strip() - elif para_type == 'empty': + DocxGenerator.apply_inline_formatting(para, element['content']) + + elif element_type == 'blockquote': + para = doc.add_paragraph() + para.style = 'Intense Quote' + DocxGenerator.apply_inline_formatting(para, element['content']) + + elif element_type == 'code_block': + para = doc.add_paragraph() + run = para.add_run(element['content']) + run.font.name = 'Courier New' + run.font.size = Pt(10) + para.style = 'No Spacing' + + elif element_type == 'table': + DocxGenerator.add_table_to_doc(doc, element['rows']) + + elif element_type == 'horizontal_rule': + para = doc.add_paragraph() + para.add_run("―" * 50) + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + elif element_type == 'empty': + doc.add_paragraph() + + @staticmethod + def add_table_to_doc(doc, rows): + """添加表格到文档""" + if not rows: + return + + table = doc.add_table(rows=len(rows), cols=len(rows[0])) + table.style = 'Table Grid' + + for i, row_data in enumerate(rows): + row_cells = table.rows[i].cells + for j, cell_data in enumerate(row_data): + if j < len(row_cells): + # 处理单元格内容的格式和文字处理 + processed_text = TextProcessor.process_text_content(cell_data) + row_cells[j].text = processed_text + + @staticmethod + def insert_image(doc, image_path, output_path): + """插入图片到文档""" + img, width = ImageProcessor.process_image(image_path) + + temp_img_path = None + if config.image_resize == "width": + temp_dir = os.path.dirname(output_path) + os.makedirs(temp_dir, exist_ok=True) + temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png") + img.save(temp_img_path) + img_path = temp_img_path + else: + img_path = image_path + + para = doc.add_paragraph() + run = para.runs[0] if para.runs else para.add_run() + run.add_picture(img_path, width=Inches(width)) + para.alignment = ImageProcessor.get_image_alignment() + + if temp_img_path and os.path.exists(temp_img_path): + try: + os.remove(temp_img_path) + except: + pass # 忽略删除临时文件的错误 + + @staticmethod + def add_formatted_paragraph(doc, content): + """添加带格式的段落""" + if not content or not content.strip(): doc.add_paragraph() return - else: - para = doc.add_paragraph() - text = content.strip() - DocxGenerator.apply_formatting(para, text, formatting) + para = doc.add_paragraph() + DocxGenerator.apply_inline_formatting(para, content) if config.line_spacing > 0: para.paragraph_format.line_spacing = config.line_spacing @staticmethod - def apply_formatting(paragraph, text, formatting): - """应用文本格式""" - format_positions = [] - for pos in formatting['bold']: - format_positions.append((pos[0], pos[1], 'bold')) - for pos in formatting['italic']: - format_positions.append((pos[0], pos[1], 'italic')) - for pos in formatting['code']: - format_positions.append((pos[0], pos[1], 'code')) + def apply_inline_formatting(paragraph, text): + """应用行内格式到段落""" + # 首先处理文字内容(顺序调换和标点符号替换) + processed_text = TextProcessor.process_text_content(text) - format_positions.sort(key=lambda x: x[0]) + # 重新提取格式信息(因为文字可能已经改变) + formatting = MarkdownParser.extract_inline_formatting(processed_text) + + # 如果没有格式,直接添加文本 + if not formatting: + paragraph.add_run(processed_text) + return current_pos = 0 - for start, end, fmt_type in format_positions: - if start > current_pos: - paragraph.add_run(text[current_pos:start]) - run = paragraph.add_run(text[start:end]) + for fmt in formatting: + # 添加格式前的普通文本 + if fmt['start'] > current_pos: + paragraph.add_run(processed_text[current_pos:fmt['start']]) - if fmt_type == 'bold': - run.text = run.text[2:-2] + # 创建格式化的run + if fmt['type'] == 'bold': + # 移除markdown标记并应用格式 + clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', processed_text[fmt['start']:fmt['end']]) + run = paragraph.add_run(clean_text) run.bold = True - elif fmt_type == 'italic': - run.text = run.text[1:-1] + + elif fmt['type'] == 'italic': + clean_text = re.sub(r'(? 0: sample_output = FileHandler.prepare_output_path(matched_pairs[0]['txt'], "", output_root) main_output_folder = os.path.dirname(sample_output) @@ -821,7 +1030,6 @@ def show_config_window(): else: config.match_pattern = "contains" - # 保存输出位置设置 config.output_location = "txt_folder" if values['output_txt_folder'] else "custom" config.image_sort_by = "name" if values['sort_name'] else "time" config.image_resize = "none" if values['resize_none'] else "width" @@ -848,9 +1056,7 @@ def show_config_window(): else: config.image_strategy = "repeat_last" - # 新增:保存配置到文件 config.save_to_file(CONFIG_FILE_PATH) - break window.close() @@ -859,7 +1065,6 @@ def show_config_window(): # 匹配编辑窗口 def show_matching_editor(matched_pairs, images_root): """显示匹配编辑窗口,允许手动调整匹配关系""" - # 获取所有可用的图片文件夹 all_image_folders = [] if os.path.isdir(images_root): for root, dirs, _ in os.walk(images_root): @@ -868,14 +1073,12 @@ def show_matching_editor(matched_pairs, images_root): rel_path = os.path.relpath(folder_path, images_root) all_image_folders.append((folder_path, rel_path)) - # 创建表格数据 table_data = [] for i, pair in enumerate(matched_pairs): txt_name = pair['txt']['name'] img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配" table_data.append([i, txt_name, img_folder]) - # 窗口布局 layout = [ [sg.Text('文件匹配编辑', font=('bold', 14))], [sg.Text('选择要修改的项目,然后从右侧选择图片文件夹')], @@ -909,28 +1112,23 @@ def show_matching_editor(matched_pairs, images_root): if event in (sg.WIN_CLOSED, '应用所有'): break - # 表格选中事件 if event == '-TABLE-': if values['-TABLE-']: selected_row = values['-TABLE-'][0] - # 设置选中项 if event == '设置选中项' and selected_row is not None and values['-FOLDERS-']: folder_idx = [i for i, f in enumerate(all_image_folders) if f[1] == values['-FOLDERS-'][0]][0] folder_path, folder_rel = all_image_folders[folder_idx] - # 更新匹配数据 matched_pairs[selected_row]['image_folder'] = { "path": folder_path, "name": os.path.basename(folder_path), "relative_path": folder_rel } - # 更新表格 table_data[selected_row][2] = folder_rel window['-TABLE-'].update(values=table_data) - # 清除选中项 if event == '清除选中项' and selected_row is not None: matched_pairs[selected_row]['image_folder'] = None table_data[selected_row][2] = "无匹配" @@ -953,6 +1151,21 @@ def show_help_window(): 5. 查看匹配结果,可点击"编辑匹配"调整匹配关系 6. 点击"开始批量转换"生成DOCX文件 +支持的Markdown格式: +- 标题:# ## ### #### ##### ###### +- 粗体:**文字** 或 __文字__ +- 斜体:*文字* 或 _文字_ +- 行内代码:`代码` +- 代码块:```语言\\n代码\\n``` +- 删除线:~~文字~~ +- 链接:[链接文字](URL) +- 图片:![图片描述](图片路径) +- 无序列表:- 或 * 或 + +- 有序列表:1. 2. 3. +- 引用:> 引用内容 +- 表格:| 列1 | 列2 | +- 水平分隔线:--- 或 *** 或 ___ + 输出路径选择: - 输出到TXT文件所在文件夹: 每个DOCX文件会直接保存在对应TXT文件所在的文件夹中 - 输出到指定文件夹: 所有DOCX文件会直接保存在您指定的文件夹中 @@ -964,9 +1177,10 @@ def show_help_window(): 转换规则: - 每个小标题的第一段后会插入一张图片 -- 支持Markdown格式: 标题、列表、粗体、斜体、代码等 +- 先将Markdown格式转换为DOCX格式,再处理文字内容 +- 支持文字顺序调换和标点符号替换功能 """ - sg.popup_scrolled('使用帮助', help_text, size=(60, 20)) + sg.popup_scrolled('使用帮助', help_text, size=(70, 25)) # 结果窗口 @@ -1001,22 +1215,20 @@ def show_results_window(results): def main_window(): """主界面""" sg.theme('BlueMono') - - # 初始化变量 matched_pairs = [] layout = [ [sg.Text('批量Markdown TXT转DOCX工具', font=('bold', 16))], - [sg.Text('(按文件名匹配TXT文件和图片文件夹)', text_color='gray')], + [sg.Text('(按文件名匹配TXT文件和图片文件夹,支持完整Markdown格式)', text_color='gray')], [sg.HSeparator()], [sg.Text('TXT文件文件夹:', size=(15, 1)), - sg.InputText(key='txt_folder', enable_events=True), + sg.InputText(key='txt_folder', enable_events=True, default_text=config.last_txt_folder), sg.FolderBrowse('浏览')], [sg.Text('图片根文件夹:', size=(15, 1)), - sg.InputText(key='images_root', enable_events=True), + sg.InputText(key='images_root', enable_events=True, default_text=config.last_images_root), sg.FolderBrowse('浏览')], [sg.Text('输出根文件夹:', size=(15, 1)), - sg.InputText(key='output_root', enable_events=True), + sg.InputText(key='output_root', enable_events=True, default_text=config.last_output_root), sg.FolderBrowse('浏览'), sg.Text('(当选择"输出到指定文件夹"时有效)', text_color='gray')], [sg.Button('扫描文件', size=(12, 1)), @@ -1039,7 +1251,6 @@ def main_window(): [sg.Button('开始批量转换', size=(15, 1), disabled=True), sg.Button('退出')] ] - # 创建窗口 window = sg.Window('批量Markdown TXT转DOCX工具', layout, resizable=True) progress_bar = window['progress_bar'] status_text = window['status_text'] @@ -1055,7 +1266,6 @@ def main_window(): output_root_input.update(disabled=True) output_root_input.Widget.configure(foreground='gray') - # 先进行一次窗口读取来完成初始化,然后再更新元素状态 window.read(timeout=1) update_output_root_state() @@ -1063,7 +1273,6 @@ def main_window(): event, values = window.read() if event in (sg.WIN_CLOSED, '退出'): - # 只有在窗口未关闭时,才尝试读取 values if values is not None: config.last_txt_folder = values.get('txt_folder', '') config.last_images_root = values.get('images_root', '') @@ -1072,13 +1281,9 @@ def main_window(): break if event == '转换设置': - # 保存当前输出根目录 current_output_root = values['output_root'] - # 显示配置窗口 show_config_window() - # 更新输出根目录输入框状态 update_output_root_state() - # 恢复输出根目录值 window['output_root'].update(current_output_root) if event == '帮助': @@ -1096,7 +1301,6 @@ def main_window(): sg.popup_error('请选择图片根文件夹') continue - # 保存当前选择的文件夹路径 - 新增 config.last_txt_folder = txt_folder config.last_images_root = images_root if values['output_root']: @@ -1112,7 +1316,6 @@ def main_window(): window.refresh() matched_pairs = FileHandler.find_matching_image_folders(txt_files, images_root) - # 更新预览表格 table_data = [] for pair in matched_pairs: img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配" @@ -1125,9 +1328,6 @@ def main_window(): preview_table.update(values=table_data) status_text.update(f'扫描完成: 找到 {len(matched_pairs)} 个TXT文件') - # 启用相关按钮 - window['-PREVIEW_TABLE-'].update(values=table_data) - window['编辑匹配'].update(disabled=False) window['开始批量转换'].update(disabled=False) @@ -1141,10 +1341,8 @@ def main_window(): sg.popup_error('请选择图片根文件夹') continue - # 打开匹配编辑窗口 matched_pairs = show_matching_editor(matched_pairs, images_root) - # 更新预览表格 table_data = [] for pair in matched_pairs: img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配" @@ -1157,26 +1355,21 @@ def main_window(): preview_table.update(values=table_data) if event == '开始批量转换' and matched_pairs: - # 检查输出路径(当选择输出到指定文件夹时) if config.output_location == "custom" and not values['output_root']: sg.popup_error('请选择输出根文件夹(在"转换设置"中选择了"输出到指定文件夹")') continue try: - # 显示进度条 progress_bar.update(0, visible=True) status_text.update('开始批量转换...') window.refresh() - # 执行批量处理 def update_batch_progress(progress, text): progress_bar.update(progress) status_text.update(f'状态: {text}') window.refresh() results = BatchProcessor.process_batch(matched_pairs, values['output_root'], update_batch_progress) - - # 显示结果 show_results_window(results) status_text.update('状态: 批量转换完成') @@ -1184,9 +1377,7 @@ def main_window(): sg.popup_error(f'批量处理失败: {str(e)}') status_text.update('状态: 批量转换失败') - # 自动填充输出文件夹(如果未设置) if (event == 'txt_folder' or event == 'images_root') and values[event] and not values['output_root']: - # 使用TXT文件夹作为默认输出文件夹 default_output = values['txt_folder'] if values['txt_folder'] else values['images_root'] window['output_root'].update(default_output) @@ -1195,4 +1386,4 @@ def main_window(): # 程序入口 if __name__ == '__main__': - main_window() + main_window() \ No newline at end of file