From 0c77c42addf7f4a17d1150d44be084e1b3482cf5 Mon Sep 17 00:00:00 2001 From: taiyi Date: Wed, 10 Sep 2025 10:35:03 +0800 Subject: [PATCH] =?UTF-8?q?=E7=AC=AC=E4=B8=80=E6=AC=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/TxT2DOCX.iml | 8 + Txt2docx2.py | 1635 +++++++++++++++++++++++++++++++++++++++++ data/error_chars.json | 116 +++ replacestr.py | 473 ++++++++++++ 4 files changed, 2232 insertions(+) create mode 100644 .idea/TxT2DOCX.iml create mode 100644 Txt2docx2.py create mode 100644 data/error_chars.json create mode 100644 replacestr.py diff --git a/.idea/TxT2DOCX.iml b/.idea/TxT2DOCX.iml new file mode 100644 index 0000000..8437fe6 --- /dev/null +++ b/.idea/TxT2DOCX.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/Txt2docx2.py b/Txt2docx2.py new file mode 100644 index 0000000..9a4de76 --- /dev/null +++ b/Txt2docx2.py @@ -0,0 +1,1635 @@ +import os +import sys +import glob +import re +import random +import json +from typing import Tuple, List +from PIL import Image +from docx import Document +from docx.shared import Inches, Pt, RGBColor +from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.enum.style import WD_STYLE_TYPE +import PySimpleGUI as sg +from replacestr import replace_text +import configparser + +CONFIG_FILE_PATH = os.path.join(os.path.expanduser("~"), ".txt2md2docx.ini") + + +# 错别字处理功能集成 +def load_error_chars(db_path: str = "data/error_chars.json") -> dict: + """加载错别字库""" + # 检查文件夹是否存在,不存在则创建 + dir_name = os.path.dirname(db_path) + if not os.path.exists(dir_name): + os.makedirs(dir_name) + + # 检查文件是否存在,不存在则创建默认库 + if not os.path.exists(db_path): + default_chars = { + "的": "地", + "地": "得", + "得": "的", + "在": "再", + "再": "在", + "是": "事", + "事": "是", + "他": "她", + "她": "他", + "你": "您", + "您": "你", + "们": "门", + "门": "们", + "有": "又", + "又": "有", + "和": "合", + "合": "和", + "到": "倒", + "倒": "到", + "就": "才", + "才": "就", + "要": "耍", + "耍": "要", + "会": "汇", 
+ "汇": "会", + "看": "着", + "着": "看", + "说": "讲", + "讲": "说", + "做": "作", + "作": "做", + "已": "己", + "己": "已", + "以": "已", + "已": "以", + "进": "近", + "近": "进", + "象": "像", + "像": "象", + "对": "队", + "队": "对", + "分": "份", + "份": "分", + } + + with open(db_path, 'w', encoding='utf-8') as f: + json.dump(default_chars, f, ensure_ascii=False, indent=2) + return default_chars + + # 加载已存在的错别字库 + with open(db_path, 'r', encoding='utf-8') as f: + return json.load(f) + + +def introduce_char_errors(text: str, intensity: float = 1.0, db_path: str = "data/error_chars.json") -> Tuple[ + str, int, List[str], List[str]]: + """ + 将文本中的正确单字替换为常见错误单字 + + 参数: + text: 要处理的文本 + intensity: 错误引入强度,0.0-1.0之间,1.0表示替换所有可能的字 + db_path: 错别字库文件路径 + + 返回: + 处理后的文本、替换的总数量、原句列表、处理后的句子列表 + """ + # 加载错别字库 + error_chars = load_error_chars(db_path) + + # 句子拆分函数 + def split_into_sentences(txt: str) -> List[str]: + separators = re.compile(r'([。!?;,.!?;])') + parts = separators.split(txt) + sentences = [] + for i in range(0, len(parts) - 1, 2): + if parts[i] or parts[i + 1]: + sentences.append(parts[i] + parts[i + 1]) + if len(parts) % 2 == 1 and parts[-1]: + sentences.append(parts[-1]) + return sentences + + # 单句错误引入函数 + def introduce_errors_to_sentence(sentence: str) -> Tuple[str, int]: + modified = list(sentence) + replace_count = 0 + for i, char in enumerate(modified): + if char in error_chars and random.random() <= intensity: + modified[i] = error_chars[char] + replace_count += 1 + return ''.join(modified), replace_count + + # 处理整个文本 + original_sentences = split_into_sentences(text) + modified_sentences = [] + total_replace = 0 + + for sentence in original_sentences: + modified, count = introduce_errors_to_sentence(sentence) + modified_sentences.append(modified) + total_replace += count + + modified_text = ''.join(modified_sentences) + return modified_text, total_replace, original_sentences, modified_sentences + + +# 配置设置 +class Config: + def __init__(self): + # 文件处理配置 + self.txt_encoding = "utf-8" + 
self.match_pattern = "exact" # exact: 完全匹配, prefix: 前缀匹配, contains: 包含 + self.output_location = "txt_folder" # txt_folder or custom + # 最近使用的文件夹路径 + self.last_txt_folder = "" + self.last_images_root = "" + self.last_output_root = "" + # 文字处理 + self.reverse_text_order = False # 转换文字顺序开关 + # 错别字处理配置 + self.enable_char_errors = False # 是否启用错别字处理 + self.char_error_intensity = 0.3 # 错别字强度 0.0-1.0 + self.char_error_db_path = "data/error_chars.json" # 错别字库路径 + # 图片处理配置 + self.image_sort_by = "name" # name or time + self.image_resize = "none" # none or width + self.image_width = 6 # 英寸 + self.image_alignment = "center" # left, center, right + self.image_strategy = "cycle" # cycle, truncate, repeat_last + # 文档格式配置 + self.line_spacing = 1.5 + self.title_levels = 6 # 支持的最大标题层级 + self.replace_punctuation = False # 是否替换标点符号 + self.add_disclaimer = False # 是否添加免责声明 + + def load_from_file(self, file_path): + if not os.path.exists(file_path): + return False + + config_parser = configparser.ConfigParser() + config_parser.read(file_path, encoding='utf-8') + + # 加载文件处理配置 + if 'FileHandling' in config_parser: + self.txt_encoding = config_parser.get('FileHandling', 'txt_encoding', fallback=self.txt_encoding) + self.match_pattern = config_parser.get('FileHandling', 'match_pattern', fallback=self.match_pattern) + self.output_location = config_parser.get('FileHandling', 'output_location', + fallback=self.output_location) + self.last_txt_folder = config_parser.get('FileHandling', 'last_txt_folder', + fallback=self.last_txt_folder) + self.last_images_root = config_parser.get('FileHandling', 'last_images_root', + fallback=self.last_images_root) + self.last_output_root = config_parser.get('FileHandling', 'last_output_root', + fallback=self.last_output_root) + + # 加载文字处理配置 + if 'TextProcessing' in config_parser: + self.reverse_text_order = config_parser.getboolean('TextProcessing', 'reverse_text_order', + fallback=self.reverse_text_order) + self.replace_punctuation = 
config_parser.getboolean('TextProcessing', 'replace_punctuation', + fallback=self.replace_punctuation) + self.add_disclaimer = config_parser.getboolean('TextProcessing', 'add_disclaimer', + fallback=self.add_disclaimer) + # 错别字处理配置 + self.enable_char_errors = config_parser.getboolean('TextProcessing', 'enable_char_errors', + fallback=self.enable_char_errors) + self.char_error_intensity = config_parser.getfloat('TextProcessing', 'char_error_intensity', + fallback=self.char_error_intensity) + self.char_error_db_path = config_parser.get('TextProcessing', 'char_error_db_path', + fallback=self.char_error_db_path) + + # 加载图片处理配置 + if 'ImageProcessing' in config_parser: + self.image_sort_by = config_parser.get('ImageProcessing', 'image_sort_by', fallback=self.image_sort_by) + self.image_resize = config_parser.get('ImageProcessing', 'image_resize', fallback=self.image_resize) + self.image_width = config_parser.getfloat('ImageProcessing', 'image_width', fallback=self.image_width) + self.image_alignment = config_parser.get('ImageProcessing', 'image_alignment', + fallback=self.image_alignment) + self.image_strategy = config_parser.get('ImageProcessing', 'image_strategy', + fallback=self.image_strategy) + + # 加载文档格式配置 + if 'DocumentFormat' in config_parser: + self.line_spacing = config_parser.getfloat('DocumentFormat', 'line_spacing', fallback=self.line_spacing) + self.title_levels = config_parser.getint('DocumentFormat', 'title_levels', fallback=self.title_levels) + + return True + + def save_to_file(self, file_path): + config_parser = configparser.ConfigParser() + + # 保存文件处理配置 + config_parser['FileHandling'] = { + 'txt_encoding': self.txt_encoding, + 'match_pattern': self.match_pattern, + 'output_location': self.output_location, + 'last_txt_folder': self.last_txt_folder, + 'last_images_root': self.last_images_root, + 'last_output_root': self.last_output_root + } + + # 保存文字处理配置 + config_parser['TextProcessing'] = { + 'reverse_text_order': str(self.reverse_text_order), + 
'replace_punctuation': str(self.replace_punctuation), + 'add_disclaimer': str(self.add_disclaimer), + 'enable_char_errors': str(self.enable_char_errors), + 'char_error_intensity': str(self.char_error_intensity), + 'char_error_db_path': self.char_error_db_path + } + + # 保存图片处理配置 + config_parser['ImageProcessing'] = { + 'image_sort_by': self.image_sort_by, + 'image_resize': self.image_resize, + 'image_width': str(self.image_width), + 'image_alignment': self.image_alignment, + 'image_strategy': self.image_strategy + } + + # 保存文档格式配置 + config_parser['DocumentFormat'] = { + 'line_spacing': str(self.line_spacing), + 'title_levels': str(self.title_levels) + } + + with open(file_path, 'w', encoding='utf-8') as f: + config_parser.write(f) + + return True + + +# 全局配置实例 +config = Config() +config.load_from_file(CONFIG_FILE_PATH) + + +# 文字处理工具类 - 增强功能 +class TextProcessor: + @staticmethod + def replace_periods(text: str) -> str: + """ + 将中间出现的句号统一替换为逗号; + 若文本末尾是句号,则直接删除该句号。 + """ + text = text.rstrip() + if not text: + return '' + + # 去掉末尾句号(如果有) + if text[-1] == '。': + text = text[:-1] + + # 把剩余句号替换为逗号 + return text.replace('。', ',') + + @staticmethod + def reverse_text_order(content): + """反转文本顺序(按字符级反转)""" + if not content: + return content + return content[::-1] + + @staticmethod + def reverse_paragraph_order(content): + """反转段落顺序(保留段落内文字顺序)""" + if not content: + return content + paragraphs = content.split('\n') + return '\n'.join(reversed(paragraphs)) + + @staticmethod + def apply_char_errors(text: str) -> str: + """应用错别字处理""" + if not config.enable_char_errors or not text: + return text + + try: + modified_text, replace_count, _, _ = introduce_char_errors( + text, + config.char_error_intensity, + config.char_error_db_path + ) + print(f"已应用错别字处理,替换了 {replace_count} 个字符。") + return modified_text + except Exception as e: + # 如果错别字处理出错,返回原文本 + print(f"错别字处理出错: {e}") + return text + + @staticmethod + def process_text_content(text): + """统一处理文字内容:顺序调换、错别字处理和标点符号替换""" + if not 
text or not text.strip(): + return text + + # 先进行文字顺序处理 + if config.reverse_text_order: + text = replace_text(text) + + # 应用错别字处理 + text = TextProcessor.apply_char_errors(text) + + # 最后进行标点符号替换 + if config.replace_punctuation: + text = TextProcessor.replace_periods(text) + + return text + + +# 增强的Markdown解析器 +class MarkdownParser: + # Markdown格式匹配模式 + PATTERNS = { + 'heading': re.compile(r'^(\s*)(#{1,6})\s+(.+)$'), + 'bold_asterisk': re.compile(r'\*\*(.+?)\*\*'), + 'bold_underscore': re.compile(r'__(.+?)__'), + 'italic_asterisk': re.compile(r'(?\s*(.+)$'), + 'horizontal_rule': re.compile(r'^(\s*[-*_]){3,}\s*$'), + 'table_row': re.compile(r'^\|(.+)\|$'), + 'table_separator': re.compile(r'^\|(\s*:?-+:?\s*\|)+$') + } + + @staticmethod + def parse(txt_content): + """解析Markdown内容为结构化数据""" + elements = [] + lines = txt_content.split('\n') + i = 0 + current_section = None + in_code_block = False + code_block_content = [] + table_mode = False + table_rows = [] + + while i < len(lines): + line = lines[i].rstrip('\r') + original_line = line + + # 处理代码块 + if line.strip().startswith('```'): + if not in_code_block: + in_code_block = True + language = line.strip()[3:].strip() + code_block_content = [] + i += 1 + continue + else: + in_code_block = False + elements.append({ + 'type': 'code_block', + 'language': language if 'language' in locals() else '', + 'content': '\n'.join(code_block_content), + 'level': 0 + }) + code_block_content = [] + i += 1 + continue + + if in_code_block: + code_block_content.append(line) + i += 1 + continue + + # 处理表格 + table_match = MarkdownParser.PATTERNS['table_row'].match(line) + table_sep_match = MarkdownParser.PATTERNS['table_separator'].match(line) + + if table_match or table_sep_match: + if not table_mode: + table_mode = True + table_rows = [] + + if table_match and not table_sep_match: + cells = [cell.strip() for cell in table_match.group(1).split('|')] + table_rows.append(cells) + + i += 1 + continue + elif table_mode: + # 表格结束 + if 
table_rows: + elements.append({ + 'type': 'table', + 'rows': table_rows, + 'level': 0 + }) + table_mode = False + table_rows = [] + + # 处理标题 + heading_match = MarkdownParser.PATTERNS['heading'].match(line) + if heading_match: + level = len(heading_match.group(2)) + if level <= config.title_levels: + # 提取标题文本(可能包含粗体等格式) + heading_text = heading_match.group(3).strip() + # 先移除Markdown标记但保留文本内容 + cleaned_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', heading_text) + elements.append({ + 'type': 'heading', + 'level': level, + 'content': heading_text, # 保留原始内容用于格式处理 + 'cleaned_content': cleaned_text # 用于显示的纯文本 + }) + current_section = elements[-1] + current_section['paragraphs'] = [] + i += 1 + continue + + # 处理水平分隔线 + if MarkdownParser.PATTERNS['horizontal_rule'].match(line): + elements.append({ + 'type': 'horizontal_rule', + 'level': 0 + }) + i += 1 + continue + + # 处理列表 + ul_match = MarkdownParser.PATTERNS['unordered_list'].match(line) + ol_match = MarkdownParser.PATTERNS['ordered_list'].match(line) + + if ul_match: + elements.append({ + 'type': 'unordered_list', + 'content': ul_match.group(1), + 'level': 0 + }) + i += 1 + continue + + if ol_match: + elements.append({ + 'type': 'ordered_list', + 'content': ol_match.group(1), + 'level': 0 + }) + i += 1 + continue + + # 处理引用 + quote_match = MarkdownParser.PATTERNS['blockquote'].match(line) + if quote_match: + elements.append({ + 'type': 'blockquote', + 'content': quote_match.group(1), + 'level': 0 + }) + i += 1 + continue + + # 处理空行 + if line.strip() == '': + elements.append({ + 'type': 'empty', + 'content': '', + 'level': 0 + }) + i += 1 + continue + + # 处理普通段落 + elements.append({ + 'type': 'paragraph', + 'content': line, + 'level': 0 + }) + + i += 1 + + # 处理剩余的表格 + if table_mode and table_rows: + elements.append({ + 'type': 'table', + 'rows': table_rows, + 'level': 0 + }) + + return MarkdownParser.group_by_sections(elements) + + @staticmethod + def group_by_sections(elements): + """将解析的元素按标题分组""" + sections = [] + 
current_section = { + 'type': 'section', + 'level': 0, + 'content': '前置内容', + 'elements': [] + } + + for element in elements: + if element['type'] == 'heading': + # 保存当前section + if current_section['elements']: + sections.append(current_section) + + # 创建新section + current_section = { + 'type': 'section', + 'level': element['level'], + 'content': element['content'], + 'elements': [] + } + else: + current_section['elements'].append(element) + + # 添加最后一个section + if current_section['elements']: + sections.append(current_section) + + return sections + + @staticmethod + def extract_inline_formatting(text): + """提取行内格式信息""" + formatting = [] + + # 提取粗体 (**) + for match in MarkdownParser.PATTERNS['bold_asterisk'].finditer(text): + formatting.append({ + 'type': 'bold', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取粗体 (__) + for match in MarkdownParser.PATTERNS['bold_underscore'].finditer(text): + formatting.append({ + 'type': 'bold', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取斜体 (*) + for match in MarkdownParser.PATTERNS['italic_asterisk'].finditer(text): + # 检查是否与粗体重叠 + overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end'] + for f in formatting if f['type'] == 'bold') + if not overlaps: + formatting.append({ + 'type': 'italic', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取斜体 (_) + for match in MarkdownParser.PATTERNS['italic_underscore'].finditer(text): + overlaps = any(f['start'] <= match.start() < f['end'] or f['start'] < match.end() <= f['end'] + for f in formatting if f['type'] in ['bold', 'italic']) + if not overlaps: + formatting.append({ + 'type': 'italic', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取行内代码 + for match in MarkdownParser.PATTERNS['code_inline'].finditer(text): + formatting.append({ + 'type': 'code', + 'start': match.start(), + 'end': 
match.end(), + 'content': match.group(1) + }) + + # 提取删除线 + for match in MarkdownParser.PATTERNS['strikethrough'].finditer(text): + formatting.append({ + 'type': 'strikethrough', + 'start': match.start(), + 'end': match.end(), + 'content': match.group(1) + }) + + # 提取链接 + for match in MarkdownParser.PATTERNS['link'].finditer(text): + formatting.append({ + 'type': 'link', + 'start': match.start(), + 'end': match.end(), + 'text': match.group(1), + 'url': match.group(2) + }) + + # 按位置排序 + formatting.sort(key=lambda x: x['start']) + return formatting + + +# 文件处理模块 +class FileHandler: + @staticmethod + def scan_txt_files(folder_path): + """扫描文件夹中的所有TXT文件""" + if not os.path.isdir(folder_path): + raise Exception(f"TXT文件夹不存在: {folder_path}") + + txt_files = [] + for root, dirs, files in os.walk(folder_path): + for file in files: + if file.lower().endswith(".txt"): + txt_path = os.path.join(root, file) + file_name = os.path.splitext(file)[0] + txt_files.append({ + "path": txt_path, + "name": file_name, + "relative_path": os.path.relpath(txt_path, folder_path), + "folder": root + }) + + if not txt_files: + raise Exception(f"在 {folder_path} 中未找到任何TXT文件") + + return sorted(txt_files, key=lambda x: x["relative_path"]) + + @staticmethod + def find_matching_image_folders(txt_files, images_root): + """根据TXT文件名匹配图片文件夹""" + if not os.path.isdir(images_root): + raise Exception(f"图片根文件夹不存在: {images_root}") + + all_image_folders = [] + for root, dirs, _ in os.walk(images_root): + for dir in dirs: + folder_path = os.path.join(root, dir) + all_image_folders.append({ + "path": folder_path, + "name": dir, + "relative_path": os.path.relpath(folder_path, images_root) + }) + + matched_pairs = [] + for txt in txt_files: + matches = [] + txt_name = txt["name"].lower() + + for img_folder in all_image_folders: + folder_name = img_folder["name"].lower() + + if config.match_pattern == "exact" and txt_name == folder_name: + matches.append(img_folder) + elif config.match_pattern == "prefix" and 
folder_name.startswith(txt_name): + matches.append(img_folder) + elif config.match_pattern == "contains" and txt_name in folder_name: + matches.append(img_folder) + + if matches: + matches.sort(key=lambda x: len(x["relative_path"])) + matched_pairs.append({ + "txt": txt, + "image_folder": matches[0], + "all_matches": matches + }) + else: + matched_pairs.append({ + "txt": txt, + "image_folder": None, + "all_matches": [] + }) + + return matched_pairs + + @staticmethod + def get_image_files(folder_path): + """获取文件夹中的所有图片文件""" + if not folder_path or not os.path.isdir(folder_path): + return [] + + image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif', '*.webp', '*.tiff'] + image_files = [] + + for ext in image_extensions: + image_files.extend(glob.glob(os.path.join(folder_path, ext))) + + if config.image_sort_by == "name": + image_files.sort() + elif config.image_sort_by == "time": + image_files.sort(key=lambda x: os.path.getmtime(x)) + + return image_files + + @staticmethod + def read_markdown_txt(file_path): + """读取含Markdown内容的TXT文件""" + if not os.path.exists(file_path): + raise Exception(f"TXT文件不存在: {file_path}") + + encodings = [config.txt_encoding, "gbk", "utf-16", "iso-8859-1"] + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + content = f.read() + content = content.replace("\r\n", "\n").replace("\r", "\n") + return content + except UnicodeDecodeError: + continue + + raise Exception(f"无法解析TXT文件(编码问题): {file_path}") + + @staticmethod + def prepare_output_path(txt_info, images_root, output_root): + """准备输出文件路径""" + if config.output_location == "txt_folder": + base_folder = txt_info["folder"] + else: + base_folder = output_root + + os.makedirs(base_folder, exist_ok=True) + + txt_name = txt_info["name"] + output_path = os.path.join(base_folder, f"{txt_name}.docx") + + counter = 1 + while os.path.exists(output_path): + output_path = os.path.join(base_folder, f"{txt_name}_{counter}.docx") + counter += 1 + + return 
output_path + + +# 图片处理模块 +class ImageProcessor: + @staticmethod + def process_image(image_path): + """处理图片""" + try: + with Image.open(image_path) as img: + # 处理图片方向 + if hasattr(img, '_getexif'): + exif = img._getexif() + if exif: + orientation_tag = 274 + if orientation_tag in exif: + orientation = exif[orientation_tag] + if orientation == 3: + img = img.rotate(180, expand=True) + elif orientation == 6: + img = img.rotate(270, expand=True) + elif orientation == 8: + img = img.rotate(90, expand=True) + + # 调整大小 + if config.image_resize == "width" and config.image_width > 0: + target_width_px = config.image_width * 96 + width, height = img.size + + if width > target_width_px: + ratio = target_width_px / width + new_height = int(height * ratio) + img = img.resize((int(target_width_px), new_height), Image.LANCZOS) + + return img, config.image_width + else: + width_in = img.width / 96 + return img, width_in + except Exception as e: + raise Exception(f"处理图片失败 {image_path}: {str(e)}") + + @staticmethod + def get_image_alignment(): + """获取图片对齐方式""" + if config.image_alignment == "left": + return WD_ALIGN_PARAGRAPH.LEFT + elif config.image_alignment == "right": + return WD_ALIGN_PARAGRAPH.RIGHT + else: + return WD_ALIGN_PARAGRAPH.CENTER + + +DISCLAIMER_TEXT = """`[免责声明]文章的时间、过程、图片均来自于网络,文章旨在传播正能量,均无低俗等不良引导,请观众勿对号入座,并上升到人身攻击等方面。观众理性看待本事件,切勿留下主观臆断的恶意评论,互联网不是法外之地。本文如若真实性存在争议、事件版权或图片侵权问题,请及时联系作者,我们将予以删除。`""" + + +# DOCX生成模块 - 完全重构 +class DocxGenerator: + @staticmethod + def generate(sections, image_files, output_path, progress_callback=None): + """生成DOCX文档 - 重构版本""" + doc = Document() + total_sections = len(sections) + image_index = 0 + image_count = len(image_files) + + for i, section in enumerate(sections): + if progress_callback: + progress = int((i / total_sections) * 100) + progress_callback(progress, f"处理章节: {section['content'][:30]}...") + + # 添加标题 + if section['level'] > 0 and section['level'] <= config.title_levels: + # 使用原始带格式的内容进行处理 + heading_text = 
TextProcessor.process_text_content(section['content']) + # 创建标题段落 + para = doc.add_heading(level=section['level']) + # 应用行内格式(包括粗体) + DocxGenerator.apply_inline_formatting(para, heading_text) + elif section['content'] != '前置内容': + heading_text = TextProcessor.process_text_content(section['content']) + para = doc.add_paragraph() + run = para.add_run(heading_text) + run.font.size = Pt(14) + run.font.bold = True + para.space_after = Pt(12) + + # 处理章节中的元素 + elements = section.get('elements', []) + if not elements: + continue + + # 处理第一个非空元素后插入图片 + first_content_added = False + + for element in elements: + # 添加元素到文档 + DocxGenerator.add_element_to_doc(doc, element) + + # 在第一个内容元素后插入图片 + if not first_content_added and element['type'] not in ['empty']: + first_content_added = True + + # 插入图片 + if image_count > 0 and image_index < image_count: + try: + DocxGenerator.insert_image(doc, image_files[image_index], output_path) + image_index += 1 + + if image_index >= image_count: + if config.image_strategy == "cycle": + image_index = 0 + elif config.image_strategy == "truncate": + image_index = image_count + + except Exception as e: + doc.add_paragraph(f"[图片插入失败: {str(e)}]") + + # 添加免责声明 + if config.add_disclaimer: + doc.add_paragraph("---") + para = doc.add_paragraph() + disclaimer_text = TextProcessor.process_text_content(DISCLAIMER_TEXT) + run = para.add_run(disclaimer_text) + run.font.size = Pt(10) + para.paragraph_format.line_spacing = 1.0 + + try: + doc.save(output_path) + if progress_callback: + progress_callback(100, "转换完成!") + return True + except Exception as e: + raise Exception(f"保存DOCX失败: {str(e)}") + + @staticmethod + def add_element_to_doc(doc, element): + """将解析的元素添加到文档中""" + etype = element['type'] + content = TextProcessor.process_text_content(element.get('content', '')) + + if etype == 'paragraph': + DocxGenerator.add_formatted_paragraph(doc, content) + + elif etype == 'unordered_list': + para = doc.add_paragraph(style='List Bullet') + 
DocxGenerator.apply_inline_formatting(para, content) + + elif etype == 'ordered_list': + para = doc.add_paragraph(style='List Number') + DocxGenerator.apply_inline_formatting(para, content) + + elif etype == 'blockquote': + para = doc.add_paragraph(style='Quote') + DocxGenerator.apply_inline_formatting(para, content) + + elif etype == 'code_block': + para = doc.add_paragraph(style='No Spacing') + run = para.add_run(element['content']) + run.font.name = 'Courier New' + run.font.size = Pt(10) + + elif etype == 'table': + DocxGenerator.add_table_to_doc(doc, element['rows']) + + elif etype == 'horizontal_rule': + DocxGenerator.add_horizontal_rule(doc) + + elif etype == 'empty': + doc.add_paragraph() + + @staticmethod + def add_horizontal_rule(doc): + """在文档中添加横线""" + para = doc.add_paragraph() + run = para.add_run() + # 添加水平线条(使用下划线作为横线) + run.font.underline = True + run.text = " " * 100 # 足够长的下划线作为横线 + para.alignment = WD_ALIGN_PARAGRAPH.CENTER + + @staticmethod + def add_table_to_doc(doc, rows): + """添加表格到文档""" + if not rows: + return + + table = doc.add_table(rows=len(rows), cols=len(rows[0])) + table.style = 'Table Grid' + + for i, row_data in enumerate(rows): + row_cells = table.rows[i].cells + for j, cell_data in enumerate(row_data): + if j < len(row_cells): + # 处理单元格内容的格式和文字处理 + processed_text = TextProcessor.process_text_content(cell_data) + row_cells[j].text = processed_text + + @staticmethod + def insert_image(doc, image_path, output_path): + """插入图片到文档""" + img, width = ImageProcessor.process_image(image_path) + + temp_img_path = None + if config.image_resize == "width": + temp_dir = os.path.dirname(output_path) + os.makedirs(temp_dir, exist_ok=True) + temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png") + img.save(temp_img_path) + img_path = temp_img_path + else: + img_path = image_path + + para = doc.add_paragraph() + run = para.runs[0] if para.runs else para.add_run() + run.add_picture(img_path, width=Inches(width)) + para.alignment 
= ImageProcessor.get_image_alignment() + + if temp_img_path and os.path.exists(temp_img_path): + try: + os.remove(temp_img_path) + except: + pass # 忽略删除临时文件的错误 + + @staticmethod + def add_formatted_paragraph(doc, content): + """添加带格式的段落""" + if not content or not content.strip(): + doc.add_paragraph() + return + + para = doc.add_paragraph() + DocxGenerator.apply_inline_formatting(para, content) + + if config.line_spacing > 0: + para.paragraph_format.line_spacing = config.line_spacing + + @staticmethod + def apply_inline_formatting(paragraph, text): + """应用行内格式到段落""" + # 首先处理文字内容(顺序调换、错别字和标点符号替换) + processed_text = TextProcessor.process_text_content(text) + + # 重新提取格式信息(因为文字可能已经改变) + formatting = MarkdownParser.extract_inline_formatting(processed_text) + + # 如果没有格式,直接添加文本 + if not formatting: + paragraph.add_run(processed_text) + return + + current_pos = 0 + + for fmt in formatting: + # 添加格式前的普通文本 + if fmt['start'] > current_pos: + paragraph.add_run(processed_text[current_pos:fmt['start']]) + + # 创建格式化的run + if fmt['type'] == 'bold': + # 移除markdown标记并应用格式 + clean_text = re.sub(r'\*\*(.+?)\*\*|__(.+?)__', r'\1\2', processed_text[fmt['start']:fmt['end']]) + run = paragraph.add_run(clean_text) + run.bold = True + + elif fmt['type'] == 'italic': + clean_text = re.sub(r'(? 
0: + sample_output = FileHandler.prepare_output_path(matched_pairs[0]['txt'], "", output_root) + main_output_folder = os.path.dirname(sample_output) + else: + main_output_folder = "" + + return { + "total": total, + "success": success_count, + "failed": len(failed_items), + "failed_items": failed_items, + "main_output_folder": main_output_folder + } + + +# 配置窗口 - 优化排版 +def show_config_window(): + """显示配置窗口 - 优化排版""" + # 创建标签页布局 + tab_file_layout = [ + [sg.Text('文件处理设置', font=('bold', 12))], + [sg.HSeparator()], + [sg.Text('TXT编码:', size=(12, 1)), + sg.Combo(['utf-8', 'gbk', 'utf-16'], default_value=config.txt_encoding, key='txt_encoding', size=(15, 1))], + [sg.Text('匹配模式:', size=(12, 1))], + [sg.Radio('完全匹配(文件名与文件夹名相同)', 'match', default=config.match_pattern == "exact", + key='match_exact')], + [sg.Radio('前缀匹配', 'match', default=config.match_pattern == "prefix", key='match_prefix')], + [sg.Radio('包含匹配', 'match', default=config.match_pattern == "contains", key='match_contains')], + [sg.HSeparator()], + [sg.Text('输出位置:', size=(12, 1))], + [sg.Radio('输出到TXT文件所在文件夹', 'output_loc', default=config.output_location == "txt_folder", + key='output_txt_folder')], + [sg.Radio('输出到指定文件夹', 'output_loc', default=config.output_location == "custom", key='output_custom')] + ] + + tab_text_layout = [ + [sg.Text('文字处理设置', font=('bold', 12))], + [sg.HSeparator()], + [sg.Checkbox('转换文字顺序', key='-REVERSE_TEXT-', default=config.reverse_text_order)], + [sg.Checkbox('替换标点符号(句号转逗号,保留结尾句号)', key='-REPLACE_PUNCTUATION-', + default=config.replace_punctuation)], + [sg.HSeparator()], + [sg.Text('错别字处理', font=('bold', 11), text_color='darkblue')], + [sg.Checkbox('启用错别字处理', key='-ENABLE_CHAR_ERRORS-', default=config.enable_char_errors, + enable_events=True)], + [sg.Text('错误强度:', size=(10, 1)), + sg.Slider(range=(0.0, 1.0), default_value=config.char_error_intensity, resolution=0.1, + orientation='h', size=(20, 15), key='char_error_intensity', disabled=not config.enable_char_errors)], + 
[sg.Text('错别字库路径:', size=(12, 1)), + sg.InputText(config.char_error_db_path, key='char_error_db_path', size=(30, 1), + disabled=not config.enable_char_errors), + sg.FileBrowse('浏览', file_types=(("JSON Files", "*.json"),), disabled=not config.enable_char_errors)], + [sg.HSeparator()], + [sg.Checkbox('添加免责声明', key='-ADD_DISCLAIMER-', default=config.add_disclaimer)] + ] + + tab_image_layout = [ + [sg.Text('图片处理设置', font=('bold', 12))], + [sg.HSeparator()], + [sg.Text('图片排序方式:', size=(12, 1))], + [sg.Radio('按名称', 'sort', default=config.image_sort_by == "name", key='sort_name'), + sg.Radio('按修改时间', 'sort', default=config.image_sort_by == "time", key='sort_time')], + [sg.HSeparator()], + [sg.Text('图片尺寸调整:', size=(12, 1))], + [sg.Radio('不调整', 'resize', default=config.image_resize == "none", key='resize_none')], + [sg.Radio('按宽度:', 'resize', default=config.image_resize == "width", key='resize_width'), + sg.InputText(str(config.image_width), size=(8, 1), key='image_width'), + sg.Text('英寸')], + [sg.HSeparator()], + [sg.Text('图片对齐方式:', size=(12, 1))], + [sg.Radio('左对齐', 'align', default=config.image_alignment == "left", key='align_left'), + sg.Radio('居中', 'align', default=config.image_alignment == "center", key='align_center'), + sg.Radio('右对齐', 'align', default=config.image_alignment == "right", key='align_right')], + [sg.HSeparator()], + [sg.Text('图片不足时策略:', size=(12, 1))], + [sg.Radio('循环使用', 'strategy', default=config.image_strategy == "cycle", key='strategy_cycle')], + [sg.Radio('忽略多余标题', 'strategy', default=config.image_strategy == "truncate", key='strategy_truncate')], + [sg.Radio('重复最后一张', 'strategy', default=config.image_strategy == "repeat_last", key='strategy_repeat')] + ] + + tab_format_layout = [ + [sg.Text('文档格式设置', font=('bold', 12))], + [sg.HSeparator()], + [sg.Text('行间距:', size=(12, 1)), + sg.InputText(str(config.line_spacing), size=(8, 1), key='line_spacing')], + [sg.Text('最大标题层级:', size=(12, 1)), + sg.Combo([1, 2, 3, 4, 5, 6], 
default_value=config.title_levels, key='title_levels', size=(8, 1))] + ] + + layout = [ + [sg.TabGroup([ + [sg.Tab('文件处理', tab_file_layout, key='tab_file')], + [sg.Tab('文字处理', tab_text_layout, key='tab_text')], + [sg.Tab('图片处理', tab_image_layout, key='tab_image')], + [sg.Tab('文档格式', tab_format_layout, key='tab_format')] + ])], + [sg.HSeparator()], + [sg.Button('确定', size=(10, 1)), sg.Button('取消', size=(10, 1)), sg.Button('重置为默认', size=(12, 1))] + ] + + window = sg.Window('转换设置', layout, modal=True, resizable=True, size=(500, 450)) + + while True: + event, values = window.read() + + if event in (sg.WIN_CLOSED, '取消'): + break + + # 处理错别字启用/禁用事件 + if event == '-ENABLE_CHAR_ERRORS-': + enabled = values['-ENABLE_CHAR_ERRORS-'] + window['char_error_intensity'].update(disabled=not enabled) + window['char_error_db_path'].update(disabled=not enabled) + + if event == '重置为默认': + # 重置为默认值 + default_config = Config() + window['txt_encoding'].update(default_config.txt_encoding) + window['match_exact'].update(True) + window['output_txt_folder'].update(True) + window['-REVERSE_TEXT-'].update(default_config.reverse_text_order) + window['-REPLACE_PUNCTUATION-'].update(default_config.replace_punctuation) + window['-ENABLE_CHAR_ERRORS-'].update(default_config.enable_char_errors) + window['char_error_intensity'].update(default_config.char_error_intensity) + window['char_error_db_path'].update(default_config.char_error_db_path) + window['-ADD_DISCLAIMER-'].update(default_config.add_disclaimer) + window['sort_name'].update(True) + window['resize_none'].update(True) + window['image_width'].update(str(default_config.image_width)) + window['align_center'].update(True) + window['strategy_cycle'].update(True) + window['line_spacing'].update(str(default_config.line_spacing)) + window['title_levels'].update(default_config.title_levels) + + if event == '确定': + # 保存配置 + config.txt_encoding = values['txt_encoding'] + + if values['match_exact']: + config.match_pattern = "exact" + elif 
def show_matching_editor(matched_pairs, images_root):
    """Show a dialog that lets the user adjust TXT-to-image-folder matches.

    Args:
        matched_pairs: list of dicts. Each dict is read through
            ``pair['txt']['name']`` and ``pair['image_folder']`` (either
            ``None`` or a dict with a ``'relative_path'`` key). The list is
            modified in place.
        images_root: directory whose sub-folders are offered as candidates.

    Returns:
        The same ``matched_pairs`` list, with ``'image_folder'`` entries
        replaced or cleared according to the user's choices.
    """
    # Every sub-folder below images_root, keyed by its path relative to the
    # root (the text shown in the list box). Relative paths are unique, so a
    # dict gives a direct lookup instead of rescanning a list on every click.
    folder_by_rel = {}
    if os.path.isdir(images_root):
        for root, dirs, _ in os.walk(images_root):
            for dir_name in dirs:  # do not shadow the builtin ``dir``
                folder_path = os.path.join(root, dir_name)
                folder_by_rel[os.path.relpath(folder_path, images_root)] = folder_path

    table_data = []
    for i, pair in enumerate(matched_pairs):
        txt_name = pair['txt']['name']
        img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配"
        table_data.append([i, txt_name, img_folder])

    layout = [
        [sg.Text('文件匹配编辑', font=('bold', 14))],
        [sg.Text('选择要修改的项目,然后从右侧选择图片文件夹')],
        [
            sg.Table(
                values=table_data,
                headings=['序号', 'TXT文件名', '匹配的图片文件夹'],
                key='-TABLE-',
                select_mode=sg.TABLE_SELECT_MODE_BROWSE,
                enable_events=True,
                justification='left',
                size=(None, 15)
            ),
            sg.VSeparator(),
            sg.Listbox(
                values=list(folder_by_rel),
                key='-FOLDERS-',
                size=(40, 15),
                enable_events=True
            )
        ],
        [sg.Button('设置选中项'), sg.Button('清除选中项'), sg.Button('应用所有')]
    ]

    window = sg.Window('匹配编辑', layout, resizable=True)
    selected_row = None

    try:
        while True:
            event, values = window.read()

            if event in (sg.WIN_CLOSED, '应用所有'):
                break

            # Remember the last clicked row; refreshing the table clears the
            # widget's own selection, so it cannot be re-read later.
            if event == '-TABLE-':
                if values['-TABLE-']:
                    selected_row = values['-TABLE-'][0]

            if event == '设置选中项' and selected_row is not None and values['-FOLDERS-']:
                folder_rel = values['-FOLDERS-'][0]
                folder_path = folder_by_rel[folder_rel]

                matched_pairs[selected_row]['image_folder'] = {
                    "path": folder_path,
                    "name": os.path.basename(folder_path),
                    "relative_path": folder_rel
                }

                table_data[selected_row][2] = folder_rel
                window['-TABLE-'].update(values=table_data)

            if event == '清除选中项' and selected_row is not None:
                matched_pairs[selected_row]['image_folder'] = None
                table_data[selected_row][2] = "无匹配"
                window['-TABLE-'].update(values=table_data)
    finally:
        # Always release the window, even if an event handler raised.
        window.close()

    return matched_pairs
def show_results_window(results):
    """Show the batch-conversion summary and offer to open the output folder.

    Args:
        results: dict read through the keys ``'total'``, ``'success'``,
            ``'failed'``, ``'failed_items'`` (an iterable of dicts with
            ``'name'`` and ``'error'``) and ``'main_output_folder'``.
    """
    main_folder = results['main_output_folder']

    if results['failed'] == 0:
        message = f"全部成功!\n共处理 {results['total']} 个文件,全部转换成功。"
        if main_folder:
            message += f"\n主要输出文件夹: {main_folder}"
        sg.popup('处理完成', message)
    else:
        failed_text = "\n".join(
            f"- {item['name']}: {item['error']}" for item in results['failed_items']
        )
        message = (f"处理完成!\n共处理 {results['total']} 个文件,"
                   f"{results['success']} 个成功,{results['failed']} 个失败。\n\n"
                   f"失败项:\n{failed_text}")
        if main_folder:
            message += f"\n主要输出文件夹: {main_folder}"
        sg.popup_scrolled('处理完成', message, size=(60, 20))

    # Ask whether to open the output folder.
    if main_folder and os.path.exists(main_folder):
        if sg.popup_yes_no('是否打开主要输出文件夹?') == 'Yes':
            # Local import keeps this block self-contained.
            import subprocess

            try:
                if sys.platform.startswith('win'):
                    os.startfile(main_folder)
                elif sys.platform.startswith('darwin'):
                    # Argument list, no shell: the folder path is user-chosen
                    # and may contain quotes or shell metacharacters.
                    subprocess.run(['open', main_folder], check=False)
                else:
                    subprocess.run(['xdg-open', main_folder], check=False)
            except OSError as exc:
                # Opening the folder is a convenience only; report the
                # failure instead of crashing the application.
                sg.popup_error(f'无法打开文件夹: {exc}')
default_text=config.last_txt_folder), + sg.FolderBrowse('浏览')], + [sg.Text('图片根文件夹:', size=(15, 1)), + sg.InputText(key='images_root', enable_events=True, default_text=config.last_images_root), + sg.FolderBrowse('浏览')], + [sg.Text('输出根文件夹:', size=(15, 1)), + sg.InputText(key='output_root', enable_events=True, default_text=config.last_output_root), + sg.FolderBrowse('浏览'), + sg.Text('(当选择"输出到指定文件夹"时有效)', text_color='gray')], + [sg.Button('扫描文件', size=(12, 1)), + sg.Button('编辑匹配', size=(12, 1), disabled=True), + sg.Button('转换设置', size=(12, 1)), + sg.Button('帮助', size=(8, 1))], + [sg.HSeparator()], + [sg.Text('匹配结果预览:', font=('bold', 10))], + [sg.Table( + values=[], + headings=['TXT文件名', '相对路径', '匹配的图片文件夹'], + key='-PREVIEW_TABLE-', + auto_size_columns=False, + col_widths=[20, 30, 30], + justification='left', + size=(None, 10) + )], + [sg.ProgressBar(100, orientation='h', size=(80, 20), key='progress_bar', visible=False)], + [sg.Text('状态: 就绪', key='status_text', size=(80, 1))], + [sg.Button('开始批量转换', size=(15, 1), disabled=True), sg.Button('退出')] + ] + + window = sg.Window('批量Markdown TXT转DOCX工具', layout, resizable=True) + progress_bar = window['progress_bar'] + status_text = window['status_text'] + preview_table = window['-PREVIEW_TABLE-'] + output_root_input = window['output_root'] + + def update_output_root_state(): + """根据配置更新输出根文件夹输入框的状态""" + if config.output_location == "custom": + output_root_input.update(disabled=False) + output_root_input.Widget.configure(foreground='black') + else: + output_root_input.update(disabled=True) + output_root_input.Widget.configure(foreground='gray') + + window.read(timeout=1) + update_output_root_state() + + while True: + event, values = window.read() + + if event in (sg.WIN_CLOSED, '退出'): + if values is not None: + config.last_txt_folder = values.get('txt_folder', '') + config.last_images_root = values.get('images_root', '') + config.last_output_root = values.get('output_root', '') + config.save_to_file(CONFIG_FILE_PATH) + break 
+ + if event == '转换设置': + current_output_root = values['output_root'] + show_config_window() + update_output_root_state() + window['output_root'].update(current_output_root) + + if event == '帮助': + show_help_window() + + if event == '扫描文件': + txt_folder = values['txt_folder'] + images_root = values['images_root'] + + if not txt_folder: + sg.popup_error('请选择TXT文件所在的文件夹') + continue + + if not images_root: + sg.popup_error('请选择图片根文件夹') + continue + + config.last_txt_folder = txt_folder + config.last_images_root = images_root + if values['output_root']: + config.last_output_root = values['output_root'] + config.save_to_file(CONFIG_FILE_PATH) + + try: + status_text.update('正在扫描TXT文件...') + window.refresh() + txt_files = FileHandler.scan_txt_files(txt_folder) + + status_text.update('正在匹配图片文件夹...') + window.refresh() + matched_pairs = FileHandler.find_matching_image_folders(txt_files, images_root) + + table_data = [] + for pair in matched_pairs: + img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配" + table_data.append([ + pair['txt']['name'], + pair['txt']['relative_path'], + img_folder + ]) + + preview_table.update(values=table_data) + status_text.update(f'扫描完成: 找到 {len(matched_pairs)} 个TXT文件') + + window['编辑匹配'].update(disabled=False) + window['开始批量转换'].update(disabled=False) + + except Exception as e: + sg.popup_error(f'扫描失败: {str(e)}') + status_text.update('状态: 扫描失败') + + if event == '编辑匹配' and matched_pairs: + images_root = values['images_root'] + if not images_root: + sg.popup_error('请选择图片根文件夹') + continue + + matched_pairs = show_matching_editor(matched_pairs, images_root) + + table_data = [] + for pair in matched_pairs: + img_folder = pair['image_folder']['relative_path'] if pair['image_folder'] else "无匹配" + table_data.append([ + pair['txt']['name'], + pair['txt']['relative_path'], + img_folder + ]) + + preview_table.update(values=table_data) + + if event == '开始批量转换' and matched_pairs: + if config.output_location == "custom" and 
not values['output_root']: + sg.popup_error('请选择输出根文件夹(在"转换设置"中选择了"输出到指定文件夹")') + continue + + try: + progress_bar.update(0, visible=True) + status_text.update('开始批量转换...') + window.refresh() + + def update_batch_progress(progress, text): + progress_bar.update(progress) + status_text.update(f'状态: {text}') + window.refresh() + + results = BatchProcessor.process_batch(matched_pairs, values['output_root'], update_batch_progress) + show_results_window(results) + status_text.update('状态: 批量转换完成') + + except Exception as e: + sg.popup_error(f'批量处理失败: {str(e)}') + status_text.update('状态: 批量转换失败') + finally: + progress_bar.update(0, visible=False) + + if (event == 'txt_folder' or event == 'images_root') and values[event] and not values['output_root']: + default_output = values['txt_folder'] if values['txt_folder'] else values['images_root'] + window['output_root'].update(default_output) + + window.close() + + +# 程序入口 +if __name__ == '__main__': + main_window() \ No newline at end of file diff --git a/data/error_chars.json b/data/error_chars.json new file mode 100644 index 0000000..9df266a --- /dev/null +++ b/data/error_chars.json @@ -0,0 +1,116 @@ +{ + "日": "曰", + "木": "本", + "度": "渡", + "暴": "爆", + "籍": "藉", + "销": "消", + "璧": "壁", + "讴": "呕", + "勠": "戮", + "篡": "纂", + "需": "须", + "迄": "讫", + "磬": "罄", + "驰": "弛", + "拨": "拔", + "朴": "扑", + "沾": "粘", + "戊": "戌", + "崇": "祟", + "菅": "管", + "荼": "茶", + "灸": "炙", + "钓": "钧", + "丐": "丏", + "亨": "享", + "赢": "羸", + "肓": "盲", + "赝": "膺", + "掣": "擎", + "峰": "锋", + "读": "续", + "眯": "咪", + "胶": "狡", + "旯": "旮", + "奄": "掩", + "恃": "持", + "径": "胫", + "坝": "狈", + "幅": "副", + "颗": "棵", + "即": "既", + "俩": "两", + "辨": "辩", + "树立": "竖立", + "其他": "其它", + "截止": "截至", + "考查": "考察", + "治服": "制服", + "权利": "权力", + "申明": "声明", + "交代": "交待", + "含义": "涵义", + "安": "按", + "曝": "暴", + "博": "搏", + "灿": "粲", + "毫": "豪", + "检": "捡", + "骄": "娇", + "梁": "粱", + "蓬": "篷", + "辟": "僻", + "欺": "期", + "洽": "恰", + + "皱": "邹", + "诸": "著", + "煮": "著", + "壮": "状", + 
class TextProcessor:
    """Split text into sentences and swap two adjacent characters in long ones."""

    def __init__(self, min_length: int = 30, custom_punctuation: Optional[str] = None):
        """Initialise the processor.

        Args:
            min_length: sentences whose length is less than or equal to this
                value are left untouched.
            custom_punctuation: sentence-ending punctuation. Either a plain
                string of characters (for example ``"。!?"``) or an explicit
                regex character class (``"[...]"``). ``None`` or an empty
                string selects the default set.
        """
        self.min_length = min_length
        self.sentence_endings = self._build_sentence_pattern(custom_punctuation)
        self.statistics = {
            'total_sentences': 0,
            'processed_sentences': 0,
            'total_chars': 0,
            'swapped_chars': 0
        }

        # Configure logging (a no-op once the root logger has handlers).
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _build_sentence_pattern(custom_punctuation: Optional[str]) -> str:
        """Return the regex character class that matches one sentence ending."""
        if not custom_punctuation:
            return r'[,。!?;?!;]'
        # A value already written as a character class is used verbatim.
        if (len(custom_punctuation) > 2
                and custom_punctuation.startswith('[')
                and custom_punctuation.endswith(']')):
            return custom_punctuation
        # A plain list of characters must match ANY of them. Used raw it would
        # only match the whole sequence, and regex metacharacters such as '?'
        # or '.' would be misinterpreted.
        return '[' + re.escape(custom_punctuation) + ']'

    def split_sentences(self, text: str) -> List[Tuple[str, str]]:
        """Split ``text`` at sentence-ending punctuation, keeping the punctuation.

        Args:
            text: input text.

        Returns:
            A list of ``(sentence content, punctuation)`` tuples. The
            punctuation is ``''`` when the content is not followed by a
            sentence ending. Content is stripped of surrounding whitespace.
        """
        if not text.strip():
            return []

        # The capturing group makes re.split keep the delimiters in the result.
        parts = re.split(f'({self.sentence_endings})', text)

        sentences = []
        i = 0
        while i < len(parts):
            content = parts[i].strip()
            if not content:
                i += 1
                continue

            # Pair the content with the delimiter that follows it, if any.
            if i + 1 < len(parts) and re.match(self.sentence_endings, parts[i + 1]):
                punctuation = parts[i + 1]
                i += 2
            else:
                punctuation = ''
                i += 1
            sentences.append((content, punctuation))
            self.statistics['total_sentences'] += 1

        return sentences

    def swap_random_chars(self, sentence: str) -> str:
        """Swap one random pair of adjacent characters in a long sentence.

        The first and the last character are never moved. Sentences that are
        empty, not longer than ``min_length``, or not longer than 3 characters
        are returned unchanged.

        Args:
            sentence: input sentence.

        Returns:
            The sentence with two adjacent inner characters exchanged.
        """
        if not sentence or len(sentence) <= self.min_length or len(sentence) <= 3:
            return sentence

        chars = list(sentence)
        original_length = len(chars)

        # Swappable adjacent pairs start at index 1 .. n-3, i.e. the pairs
        # (1,2), (2,3), ..., (n-3,n-2). With n >= 4 this range is never empty.
        start_idx = 1
        end_idx = original_length - 3

        swap_start = random.randint(start_idx, end_idx)
        swap_end = swap_start + 1
        chars[swap_start], chars[swap_end] = chars[swap_end], chars[swap_start]

        self.statistics['processed_sentences'] += 1
        self.statistics['swapped_chars'] += 2

        self.logger.debug(f"交换相邻位置 {swap_start} 和 {swap_end},句子长度:{original_length}")

        return ''.join(chars)

    def process_text(self, text: str) -> str:
        """Process ``text`` line by line, swapping characters in long sentences.

        Statistics are reset at the start of every call.

        Args:
            text: input text.

        Returns:
            The processed text. Blank lines are passed through unchanged.
        """
        if not text:
            return text

        self.statistics = {
            'total_sentences': 0,
            'processed_sentences': 0,
            'total_chars': len(text),
            'swapped_chars': 0
        }

        processed_paragraphs = []
        for paragraph in text.split('\n'):
            if not paragraph.strip():
                processed_paragraphs.append(paragraph)
                continue

            processed_paragraphs.append(''.join(
                self.swap_random_chars(content) + punctuation
                for content, punctuation in self.split_sentences(paragraph)
            ))

        return '\n'.join(processed_paragraphs)

    def get_statistics(self) -> Dict[str, Any]:
        """Return a copy of the statistics gathered by the last run."""
        return self.statistics.copy()

    def print_statistics(self):
        """Print the statistics gathered by the last run to stdout."""
        stats = self.get_statistics()
        print("\n" + "=" * 50)
        print("处理统计信息:")
        print(f"总字符数:{stats['total_chars']}")
        print(f"总句子数:{stats['total_sentences']}")
        print(f"处理句子数:{stats['processed_sentences']}")
        print(f"交换字符数:{stats['swapped_chars']}")
        if stats['total_sentences'] > 0:
            print(f"处理率:{stats['processed_sentences'] / stats['total_sentences'] * 100:.1f}%")
        print("=" * 50)
def main():
    """Command-line entry point: parse arguments, process the text, emit the result.

    Exits with status 1 when the input cannot be read or processing fails,
    and with status 0 when the input text is empty.
    """
    parser = setup_argument_parser()
    args = parser.parse_args()

    # Log level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Random seed (for reproducible runs). Compare against None so that an
    # explicit ``--seed 0`` is honoured instead of being treated as "not set".
    if args.seed is not None:
        random.seed(args.seed)

    # Obtain the input text
    try:
        if args.file:
            text = FileHandler.read_file(args.file)
        elif args.text:
            text = args.text
        elif args.stdin:
            text = sys.stdin.read()
        else:
            print("错误:请指定输入源")
            sys.exit(1)

        if not text.strip():
            print("警告:输入文本为空")
            sys.exit(0)

    except (FileNotFoundError, PermissionError, UnicodeDecodeError) as e:
        print(f"错误:{e}")
        sys.exit(1)

    # Build the processor and run it
    try:
        processor = TextProcessor(
            min_length=args.length,
            custom_punctuation=args.punctuation
        )

        processed_text = processor.process_text(text)

        # Emit the result
        if args.output:
            FileHandler.write_file(args.output, processed_text, args.encoding)
            print(f"处理完成,结果已保存到 '{args.output}'")
        else:
            print("处理结果:")
            print("-" * 50)
            print(processed_text)

        # Optional statistics
        if args.statistics:
            processor.print_statistics()

    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"处理过程中发生错误:{e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
processor.process_text(sample_text) + print("处理后:") + print(processed) + + processor.print_statistics() + + + print("\n使用说明:") + print("命令行用法:") + print(" python script.py -f input.txt # 处理文件") + print(" python script.py -t '你的文本内容' # 直接处理文本") + print(" python script.py -f input.txt -l 20 # 设置长度阈值为20") + print(" python script.py -f input.txt -o output.txt # 输出到文件") + print(" python script.py -f input.txt -p '。!?' -s # 自定义标点符号并显示统计") + print(" python script.py test # 运行单元测试") + + return processed + + + +text = """阅读此文之前,麻烦您点击一下“关注”,既方便您进行讨论和分享,又能给您带来不一样的参与感,创作不易,感谢您的支持。 + +曾经“半路出家”,如今黯然无声,他的故事值得一品 +说起央视的主持人,大家第一反应肯定是一个个字正腔圆、形象出彩的脸孔。可是在这其中,有一位却用浓厚的潮汕口音,还有点“油滑”的幽默,自成一派。他就是阿丘。 + +这个名字,可能在现在已经鲜有人提起,但在过去,他可是实打实的“名嘴”。不过,咱来说点耐人寻味的,他是怎么走到央视巅峰,又怎么“高台跳水”的?这故事,够扎心,更够意味深长。 + +看似格格不入,却杀出重围 +熟悉阿丘的人,一听他那口音,就知道这是“岭南口音模块”的标配。他是个土生土长的广东人,也因为家里是军人家庭,小时候经常搬家,成了个活脱脱的语言天才,学会了好几个地方的方言。什么潮汕话、粤语、客家话,信手拈来。不得不说,小时候到处跑打下的基础,倒给他多了一点和别人不一样的“人味儿”。 + +他大学学的专业,可和主持半毛钱关系没有,是经济学。毕业后,他分配到了南宁的棉纺印染厂,待遇不错,是个政工干部。这时候的阿丘,怎么看都是个稳稳的职场小白,可谁能想到,后来的他能走上舞台呢? 
+ +90年代,相声、小品各类幽默比赛风靡全国。阿丘平时最爱的就是琢磨这些妙语段子,一心觉得自己是个“未被发现的宝藏男孩”。机会来了,1992年,他参加广西举办的笑星大赛,居然拿了个一等奖。这下可出名了,厂里人都认识他,他本人也成了“地方笑星”。 + +再后来,他调到了广西电视台,开始主持节目。头几年波澜不惊,直到他参加《南北笑星火辣辣》,凭借风趣和机灵吸引了更多目光。2003年,这个来自地方台的主持人,直接杀进了央视主持圈。靠什么?靠他的个性和风格。 + +从风光无限到画风突变 +阿丘进入央视后,主持了好几档节目。他的幽默和接地气,与当时一板一眼的正规主持人大不相同。因此,他迅速被贴上“个性主持”的标签。尤其是在《社会记录》里,那带点潮汕腔调的问句,竟成了一种标志。 + +可惜,说话爽快的他,也因为“不当言论”栽了跟头。事情发生在2020年,正是全国上下齐心合力抗击疫情的时候。阿丘不知道怎么回事,在自己的博客里发了一些让人难以接受的言论。里头什么“东亚病夫”“道歉”显得格外刺眼。 + +不得不说,这一锅凉水泼得够彻底。网友立刻开始深挖,一挖还真揭出不少黑历史。有人爆料,他婚内包养女大学生,还试图给实习机会。虽然阿丘本人否认得七七八八,但这些传闻和再度破裂的婚姻,难免让人联想。 + +面对铺天盖地的指责,阿丘的态度是硬得离谱,一句道歉都没有。“嘴皮子”在这时候完全失灵了。要说在镜头前笑侃万事的大叔,这一次是真没能站住脚。 + +离开央视后的低调生活 +最后,阿丘与央视长达12年的缘分彻底告一段落。此后的日子,他也算是从公众视线中消失了。最让人记得的,是他两年后现身老搭档张泉灵的节目,只是,这一次,他的亮相显得缥缈又散淡。 + +如今阿丘的身份,更多转向了自媒体。开了个叫“阿丘观山”的账号,做起旅游文化博主。视频里,他介绍名山大川,什么五台山、武当山,天天讲人生感悟。这画风,和过去主持访谈节目的他,可真是差太远了。 + +不少老观众打开他的账号,可能都得感叹一声“物是人非”。更有网友直言,他的语气里听到了些许“悔意”,又觉得是假装云淡风轻,实际还是难以摆脱舆论的阴影。 + +留下的启示和争议 +阿丘的故事,是难得一见的。从地方电视台到央视舞台,他用12年时间登上顶峰,却因为12个字毁了前程。这起伏,真像一出大戏。 + +咱们反思一下,也许有些人,天赋、机遇都抓得很精准,但言行失当,永远是会砸场子的导火索。阿丘的人生轨迹,正说明了这一点。 + +现在问题来了,大家怎么看阿丘这个人?你是觉得他个性可惜,还是自毁前程? + +欢迎留言讨论,你们的每一次互动,都是创作的动力。""" + + +result = replace_text(text) +print(result) \ No newline at end of file